Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

43

* @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu

44

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

45

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

46

* @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=int

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

47

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

48

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

49

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

50

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

51

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

52

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

53

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

54

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

55

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

56

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

57

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

58

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

59

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

60

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

61

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

62

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

63

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

64

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

65

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

66

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

67

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

68

*/

69

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

70

TENSOR4D_DECLARATION(src),

71

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

72

#if defined(HAS_BIAS)

73

,

74

VECTOR_DECLARATION(bias)

75

#endif // defined(HAS_BIAS)

76

)

77

{

78

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

79

#if defined(SRC_DEPTH)

80

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

81

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

82

#else /* defined(SRC_DEPTH) */

83

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

84

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

85

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

88

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

89

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

90

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

91

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

92

93

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

94

// Compute the 2x1 or 1x2 output tile

95

// out00 = d00 + d01 + d02

96

// out01 = d01 - d02 - d03

97

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

98

float out00 = d00 + d01 + d02;

99

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

101

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

102

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

103

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

104

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

105

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

106

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

107

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

108

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

109

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

110

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

111

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

112

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

113

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

114

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

115

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

118

float k0 = d01 + d11 + d21;

119

float k1 = d02 + d12 + d22;

120

float k2 = d11 - d21 - d31;

121

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

122

123

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

124

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

125

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

126

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

127

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

132

133

out00 += d00 + d20 + k0 + k1;

134

out01 += k0 - k1 - (d03 + d23);

135

out10 += -d20 - d30 + k2 + k3;

136

out11 += k2 - k3 + d23 + d33;

137

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

138

139

int y_in = get_global_id(1);

140

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

141

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

142

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

143

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

144

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

145

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

147

#if defined(HAS_BIAS)

148

// Add bias

149

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

150

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

151

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

152

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

153

out00 += (float)b;

154

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

155

#endif // defined(HAS_BIAS)

156

157

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

158

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

159

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

160

#else /* defined(SRC_DEPTH) */

161

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

162

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

163

164

// Store the output tile

165

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

166

const const VEC_DATA_TYPE(DATA_TYPE, 2)

167

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));

168

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

169

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

170

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

171

vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

172

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

173

174

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

175

#if defined(HAS_BIAS)

176

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

177

out10 += (DATA_TYPE)b;

178

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

179

#endif // defined(HAS_BIAS)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

180

vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

181

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

182

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

183

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

184

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

185

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

186

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

187

*

188

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

189

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

190

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

191

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

192

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

193

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

194

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

195

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

196

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

197

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

198

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

199

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

200

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

201

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

202

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

203

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

204

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

205

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

206

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

207

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

208

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

209

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

210

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

211

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

212

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

213

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

214

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

215

*/

216

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

217

TENSOR4D_DECLARATION(src),

218

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

219

#if defined(HAS_BIAS)

220

,

221

VECTOR_DECLARATION(bias)

222

#endif // defined(HAS_BIAS)

223

)

224

{

225

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

226

#if defined(SRC_DEPTH)

227

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

228

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

229

#else /* defined(SRC_DEPTH) */

230

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

231

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

232

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

233

234

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

235

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

236

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

237

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

238

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

239

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

240

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

241

242

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

243

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

244

float out00 = d00 + d01 + d02 + d03 + d04;

245

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

246

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

247

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

248

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

249

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

250

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

251

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

252

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

253

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

254

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

255

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

256

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

257

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

258

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

259

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

260

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

261

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

262

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

263

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

264

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

265

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

266

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

267

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

268

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

269

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

270

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

271

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

272

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

273

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

274

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

275

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

276

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

277

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

278

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

279

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

280

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

281

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

282

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

283

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

284

285

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

286

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

287

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

288

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

289

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

290

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

291

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

292

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

293

294

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

295

out01 += k1 - d02 - d12 - d22 - d32 - d42;

296

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

297

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

298

299

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

300

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

301

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

302

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

303

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

304

305

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

306

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

307

308

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

309

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

310

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

311

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

312

313

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

314

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

315

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

316

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

317

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

318

319

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

320

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

321

322

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

323

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

324

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

325

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

326

327

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

328

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

329

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

330

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

331

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

332

333

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

334

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

335

336

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

337

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

338

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

339

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

340

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

341

342

int y_in = get_global_id(1);

343

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

344

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

345

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

346

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

347

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

348

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

349

350

#if defined(HAS_BIAS)

351

// Add bias

352

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

353

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

354

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

355

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

360

#endif // defined(HAS_BIAS)

361

362

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

363

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

364

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

365

#else /* defined(SRC_DEPTH) */

366

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

367

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

368

369

// Store the output tile

370

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

371

VEC_DATA_TYPE(DATA_TYPE, 4)

372

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

373

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

374

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

375

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

376

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

377

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

378

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

379

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

380

381

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

382

#if defined(HAS_BIAS)

383

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

388

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

393

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

398

#endif // defined(HAS_BIAS)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

399

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

400

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

401

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

402

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

403

}

404

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

405

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

406

*

407

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

408

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

409

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

410

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

411

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

412

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

413

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

414

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

415

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

416

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

417

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

418

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

419

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

420

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

421

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

422

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

423

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

424

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

425

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

426

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

427

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

428

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

429

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

430

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

431

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

432

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

433

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

434

* @param[in] dst_size Size of the destination tensor, minus the last padding

435

*/

436

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

437

TENSOR4D_DECLARATION(src),

438

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

439

#if defined(HAS_BIAS)

440

VECTOR_DECLARATION(bias),

441

#endif // defined(HAS_BIAS)

442

int dst_size)

443

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

444

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

445

#if defined(SRC_DEPTH)

446

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

447

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

448

#else /* defined(SRC_DEPTH) */

449

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

450

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

451

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

452

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

453

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

454

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

455

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

456

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

457

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

458

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

459

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

460

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

461

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

462

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

463

float out00 = d00 + d01 + d02 + d03 + d04;

464

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

465

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

466

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

467

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

468

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

469

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

470

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

471

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

472

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

473

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

474

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

475

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

476

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

477

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

478

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

479

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

480

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

481

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

482

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

483

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

484

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

485

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

486

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

487

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

488

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

489

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

490

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

491

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

492

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

493

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

494

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

495

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

496

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

497

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

498

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

499

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

500

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

501

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

502

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

503

504

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

505

float out00 = d01 + d21 + d41 + d11 + d31;

506

float out01 = d01 + d21 + d41 + d11 + d31;

507

float out02 = d01 + d21 + d41 + d11 + d31;

508

float out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

509

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

510

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

511

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

512

513

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

514

out01 += k1 - d02 - d12 - d22 - d32 - d42;

515

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

516

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

517

518

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

519

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

520

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

521

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

522

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

523

524

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

525

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

526

527

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

528

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

529

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

530

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

531

532

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

533

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

534

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

535

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

536

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

537

538

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

539

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

540

541

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

542

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

543

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

544

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

545

546

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

547

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

548

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

549

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

550

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

551

552

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

553

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

554

555

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

556

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

557

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

558

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

559

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

560

561

int y_in = get_global_id(1);

562

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

563

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

564

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

565

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

566

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

567

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

568

569

#if defined(HAS_BIAS)

570

// Add bias

571

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

572

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

573

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

574

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

575

out00 += (DATA_TYPE)b;

576

out01 += (DATA_TYPE)b;

577

out02 += (DATA_TYPE)b;

578

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

579

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

580

out10 += (DATA_TYPE)b;

581

out11 += (DATA_TYPE)b;

582

out12 += (DATA_TYPE)b;

583

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

584

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

585

out20 += (DATA_TYPE)b;

586

out21 += (DATA_TYPE)b;

587

out22 += (DATA_TYPE)b;

588

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

589

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

590

out30 += (DATA_TYPE)b;

591

out31 += (DATA_TYPE)b;

592

out32 += (DATA_TYPE)b;

593

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

594

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

595

596

#endif // defined(HAS_BIAS)

597

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

598

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

599

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

600

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

601

#else /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

602

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

603

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

604

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

605

606

// Store the 1x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

607

VEC_DATA_TYPE(DATA_TYPE, 4)

608

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

609

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;

610

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;

611

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;

612

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

613

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

614

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

615

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

616

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

617

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

618

VEC_DATA_TYPE(DATA_TYPE, 4)

619

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

620

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;

621

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;

622

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;

623

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

624

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

625

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

626

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

627

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

628

#else /* defined(SRC_DEPTH) */

629

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

630

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

631

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

632

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

633

634

// Store the 4x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

635

VEC_DATA_TYPE(DATA_TYPE, 4)

636

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

637

VEC_DATA_TYPE(DATA_TYPE, 4)

638

out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));

639

VEC_DATA_TYPE(DATA_TYPE, 4)

640

out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));

641

VEC_DATA_TYPE(DATA_TYPE, 4)

642

out3_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));

643

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;

644

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;

645

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;

646

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;

647

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;

648

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;

649

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;

650

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;

651

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;

652

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;

653

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;

654

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;

655

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;

656

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;

657

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;

658

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

659

660

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

661

}

662

663

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

664

({ \

665

comm_fact.s0 = d1 + d2; \

666

comm_fact.s1 = d3 + d4; \

667

comm_fact.s2 = d5 + d6; \

668

\

669

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

670

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

671

\

672

comm_fact.s0 = d1 - d2; \

673

comm_fact.s1 = d3 - d4; \

674

comm_fact.s2 = d5 - d6; \

675

\

676

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

677

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

678

})

679

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

680

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

681

*

682

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

683

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

684

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

685

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

686

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

687

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

688

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

689

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

690

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

691

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

692

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

693

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

694

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

695

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

696

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

697

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

698

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

699

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

700

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

701

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

702

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

703

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

704

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

705

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

706

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

707

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

708

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

709

*/

710

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

711

TENSOR4D_DECLARATION(src),

712

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

713

#if defined(HAS_BIAS)

714

,

715

VECTOR_DECLARATION(bias)

716

#endif // defined(HAS_BIAS)

717

)

718

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

719

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

720

#if defined(SRC_DEPTH)

721

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

722

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

723

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

724

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

725

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

726

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

727

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

728

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

729

// Compute output address

730

int y_in = get_global_id(1);

731

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

732

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

733

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

734

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

735

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

736

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

737

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

738

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

739

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

740

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

741

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

742

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

743

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

744

745

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

746

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

747

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

748

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

749

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

750

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

751

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

752

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

753

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

754

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

755

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

756

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

757

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

758

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

759

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

760

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

761

762

#if defined(HAS_BIAS)

763

// Add bias

764

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

765

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

766

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

767

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

768

out00 += (DATA_TYPE)b;

769

out01 += (DATA_TYPE)b;

770

out02 += (DATA_TYPE)b;

771

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

772

#endif // defined(HAS_BIAS)

773

774

// Store the output tile

775

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

776

VEC_DATA_TYPE(DATA_TYPE, 4)

777

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

778

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

779

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

780

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

781

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

782

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

783

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

784

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

785

786

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

787

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

788

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

789

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

790

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

791

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

792

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

793

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

794

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

795

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

796

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

797

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

798

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

799

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

800

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

801

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

802

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

803

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

804

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

805

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

806

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

807

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

808

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

809

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

810

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

811

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

812

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

813

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

814

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

815

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

816

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

817

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

818

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

819

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

820

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

821

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

822

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

823

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

824

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

825

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

826

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

827

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

828

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

829

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

830

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

831

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

832

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

833

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

834

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

835

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

836

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

837

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

838

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

839

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

840

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

841

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

842

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

843

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

844

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

845

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

846

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

847

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

848

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

849

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

850

851

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

852

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

853

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

854

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

855

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

856

857

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

858

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

859

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

860

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

861

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

862

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

863

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

864

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

865

866

// Compute the 4x4 output tile

867

comm_fact0 = tmp_col1 + tmp_col2;

868

comm_fact1 = tmp_col3 + tmp_col4;

869

comm_fact2 = tmp_col5 + tmp_col6;

870

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

871

VEC_DATA_TYPE(float, 4)

872

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

873

VEC_DATA_TYPE(float, 4)

874

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

875

876

comm_fact0 = tmp_col1 - tmp_col2;

877

comm_fact1 = tmp_col3 - tmp_col4;

878

comm_fact2 = tmp_col5 - tmp_col6;

879

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

880

VEC_DATA_TYPE(float, 4)

881

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

882

VEC_DATA_TYPE(float, 4)

883

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

884

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

885

#if defined(HAS_BIAS)

886

// Add bias

887

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

888

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

889

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

890

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

891

out_col0 += (VEC_DATA_TYPE(float, 4))b;

892

out_col1 += (VEC_DATA_TYPE(float, 4))b;

893

out_col2 += (VEC_DATA_TYPE(float, 4))b;

894

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

895

#endif // defined(HAS_BIAS)

896

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

897

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

898

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

899

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

900

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

901

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

902

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

903

}

904

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

905

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

906

*

907

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

908

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

909

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

910

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

911

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

912

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

913

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

914

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

915

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

916

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

917

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

918

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

919

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

920

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

921

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

922

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

923

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

924

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

925

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

926

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

927

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

928

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

929

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

930

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

931

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

932

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

933

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

934

*/

935

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

936

TENSOR4D_DECLARATION(src),

937

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

938

#if defined(HAS_BIAS)

939

VECTOR_DECLARATION(bias),

940

#endif // defined(HAS_BIAS)

941

int dst_size)

942

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

943

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

944

#if defined(SRC_DEPTH)

945

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

946

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

947

#else /* defined(SRC_DEPTH) */

948

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

949

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

950

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

951

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

952

int y_in = get_global_id(1);

953

int x_out = get_global_id(0);

954

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

955

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

956

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

957

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

958

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

959

960

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

961

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

962

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

963

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

964

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

965

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

966

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

967

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

968

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

969

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

970

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

971

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

972

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

973

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

974

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

975

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

976

977

#if defined(HAS_BIAS)

978

// Add bias

979

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

980

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

981

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

982

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

987

#endif // defined(HAS_BIAS)

988

989

// Store the output tile

990

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

991

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

992

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

993

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

994

#else /* defined(SRC_DEPTH) */

995

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

996

#endif /* defined(SRC_DEPTH) */

997

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

998

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

999

VEC_DATA_TYPE(DATA_TYPE, 4)

1000

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

1001

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

1002

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

1003

*(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;

1004

*(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1005

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1006

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1007

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1008

VEC_DATA_TYPE(DATA_TYPE, 4)

1009

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

1010

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

1011

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

1012

*(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;

1013

*(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1014

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1015

1016

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1017

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1018

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1019

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1020

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1021

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1022

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1023

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1024

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1025

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1026

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1027

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1028

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1029

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1030

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1031

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1032

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1033

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1034

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1035

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1036

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1037

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1038

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1039

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1040

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1041

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1042

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1043

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1044

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1045

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1046

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1047

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1048

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1049

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1050

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1051

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1052

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1053

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1054

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1055

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1056

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1057

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1058

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1059

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1060

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1061

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1062

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1063

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1064

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1065

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1066

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1067

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1068

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1069

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1070

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1071

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1072

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1073

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1074

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1075

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1076

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1077

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1078

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1079

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1080

1081

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1082

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1083

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1084

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1085

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1086

1087

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1088

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1089

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1090

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1091

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1092

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1093

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1094

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1095

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1096

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1097

comm_fact0 = tmp_col1 + tmp_col2;

1098

comm_fact1 = tmp_col3 + tmp_col4;

1099

comm_fact2 = tmp_col5 + tmp_col6;

1100

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1101

VEC_DATA_TYPE(float, 4)

1102

out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1103

VEC_DATA_TYPE(float, 4)

1104

out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1105

1106

comm_fact0 = tmp_col1 - tmp_col2;

1107

comm_fact1 = tmp_col3 - tmp_col4;

1108

comm_fact2 = tmp_col5 - tmp_col6;

1109

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1110

VEC_DATA_TYPE(float, 4)

1111

out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1112

VEC_DATA_TYPE(float, 4)

1113

out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1114

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1115

#if defined(HAS_BIAS)

1116

// Add bias

1117

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1118

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1119

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1120

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1121

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1122

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1123

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1124

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1125

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1126

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1127

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1128

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1129

#else /* defined(SRC_DEPTH) */

1130

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1131

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

1132

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1133

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1134

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1135

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1136

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1137

out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1138

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1139

out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1140

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1141

out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1142

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1143

out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1144

1145

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

1146

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

1147

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;

1148

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;

1149

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

1150

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

1151

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;

1152

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;

1153

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;

1154

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;

1155

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;

1156

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;

1157

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;

1158

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;

1159

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;

1160

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1161

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1162

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1163

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1164

1165

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1166

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1167

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1168

*

1169

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1170

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1171

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1172

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1173

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1174

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1175

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1176

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1177

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1178

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1179

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1180

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1181

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1182

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1183

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1184

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1185

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1186

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1187

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1188

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1189

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1190

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1191

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1192

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1193

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1194

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1195

*/

1196

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1197

TENSOR4D_DECLARATION(src),

1198

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1199

#if defined(HAS_BIAS)

1200

,

1201

VECTOR_DECLARATION(bias)

1202

#endif // defined(HAS_BIAS)

1203

)

1204

{

1205

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1212

src_stride_w,

1213

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1214

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1222

dst_stride_w,

1223

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1224

dst_offset_first_element_in_bytes

1225

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1231

#endif // defined(HAS_BIAS)

1232

);

1233

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1234

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1235

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1236

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1237

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1238

*

1239

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1240

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1241

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1242

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1243

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1244

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1245

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1246

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1247

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1248

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1249

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1250

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1251

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1252

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1253

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1254

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1255

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1256

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1257

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1258

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1259

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1260

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1261

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1262

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1263

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1264

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1265

*/

1266

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1267

TENSOR4D_DECLARATION(src),

1268

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1269

#if defined(HAS_BIAS)

1270

,

1271

VECTOR_DECLARATION(bias)

1272

#endif // defined(HAS_BIAS)

1273

)

1274

{

1275

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1282

src_stride_w,

1283

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1284

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1292

dst_stride_w,

1293

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1294

dst_offset_first_element_in_bytes

1295

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1301

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1306

*

1307

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1308

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1309

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1310

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1311

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1312

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1313

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1314

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1315

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1316

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1317

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1318

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1319

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1320

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1321

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1322

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1323

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1324

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1325

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1326

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1327

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1328

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1329

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1330

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1331

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1332

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1333

*/

1334

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1335

TENSOR4D_DECLARATION(src),

1336

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1337

#if defined(HAS_BIAS)

1338

,

1339

VECTOR_DECLARATION(bias)

1340

#endif // defined(HAS_BIAS)

1341

)

1342

{

1343

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1350

src_stride_w,

1351

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1352

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1360

dst_stride_w,

1361

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1362

dst_offset_first_element_in_bytes

1363

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1369

#endif // defined(HAS_BIAS)

1370

);

1371

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1372

1373

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1374

*

1375

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1376

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1377

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1378

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1379

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1380

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1381

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1382

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1383

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1384

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1385

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1386

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1387

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1388

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1389

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1390

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1391

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1392

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1393

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1394

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1395

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1396

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1397

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1398

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1399

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1400

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1401

*/

1402

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1403

TENSOR4D_DECLARATION(src),

1404

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1405

#if defined(HAS_BIAS)

1406

VECTOR_DECLARATION(bias),

1407

#endif // defined(HAS_BIAS)

1408

int dst_size)

1409

{

1410

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1417

src_stride_w,

1418

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1419

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1427

dst_stride_w,

1428

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1429

dst_offset_first_element_in_bytes,

1430

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1435

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1440

*

1441

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1442

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1443

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1444

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1445

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1446

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1447

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1448

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1449

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1450

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1451

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1452

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1453

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1454

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1455

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1456

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1457

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1458

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1459

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1460

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1461

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1462

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1463

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1464

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1465

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1466

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1467

*/

1468

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1469

TENSOR4D_DECLARATION(src),

1470

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1471

#if defined(HAS_BIAS)

1472

VECTOR_DECLARATION(bias),

1473

#endif // defined(HAS_BIAS)

1474

int dst_size)

1475

{

1476

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1483

src_stride_w,

1484

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1485

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1493

dst_stride_w,

1494

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1495

dst_offset_first_element_in_bytes,

1496

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1501

#endif // defined(HAS_BIAS)

1502

dst_size);

1503

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1504

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1505

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1506

1507

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1508

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1509

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1510

*

1511

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1512

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1513

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1514

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1515

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1516

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1517

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1518

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1519

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1520

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1521

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1522

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1523

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1524

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1525

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1526

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1527

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1528

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1529

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1530

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1531

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1532

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1533

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1534

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1535

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1536

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1537

*/

1538

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1539

TENSOR4D_DECLARATION(src),

1540

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1541

#if defined(HAS_BIAS)

1542

,

1543

VECTOR_DECLARATION(bias)

1544

#endif // defined(HAS_BIAS)

1545

)

1546

{

1547

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1554

src_stride_w,

1555

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1556

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1564

dst_stride_w,

1565

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1566

dst_offset_first_element_in_bytes

1567

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1573

#endif // defined(HAS_BIAS)

1574

);

1575

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1576

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1577

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1578

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1579

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1580

*

1581

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1582

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1583

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1584

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1585

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1586

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1587

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1588

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1589

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1590

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1591

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1592

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1593

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1594

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1595

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1596

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1597

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1598

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1599

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1600

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1601

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1602

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1603

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1604

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1605

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1606

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1607

*/

1608

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1609

TENSOR4D_DECLARATION(src),

1610

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1611

#if defined(HAS_BIAS)

1612

,

1613

VECTOR_DECLARATION(bias)

1614

#endif // defined(HAS_BIAS)

1615

)

1616

{

1617

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1624

src_stride_w,

1625

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1626

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1634

dst_stride_w,

1635

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1636

dst_offset_first_element_in_bytes

1637

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1643

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1648

*

1649

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1650

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1651

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1652

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1653

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1654

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1655

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1656

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1657

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1658

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1659

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1660

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1661

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1662

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1663

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1664

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1665

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1666

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1667

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1668

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1669

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1670

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1671

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1672

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1673

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1674

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1675

*/

1676

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1677

TENSOR4D_DECLARATION(src),

1678

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1679

#if defined(HAS_BIAS)

1680

,

1681

VECTOR_DECLARATION(bias)

1682

#endif // defined(HAS_BIAS)

1683

)

1684

{

1685

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1692

src_stride_w,

1693

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1694

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1702

dst_stride_w,

1703

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1704

dst_offset_first_element_in_bytes

1705

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1711

#endif // defined(HAS_BIAS)

1712

);

1713

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1714

1715

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

1716

*

1717

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1718

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1719

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1720

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1721

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1722

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1723

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1724

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1725

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1726

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1727

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1728

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1729

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1730

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1731

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1732

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1733

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1734

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1735

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1736

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1737

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1738

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1739

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1740

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1741

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1742

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1743

*/

1744

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1745

TENSOR4D_DECLARATION(src),

1746

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1747

#if defined(HAS_BIAS)

1748

VECTOR_DECLARATION(bias),

1749

#endif // defined(HAS_BIAS)

1750

int dst_size)

1751

{

1752

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1759

src_stride_w,

1760

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1761

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1769

dst_stride_w,

1770

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1771

dst_offset_first_element_in_bytes,

1772

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1777

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1782

*

1783

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1784

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1785

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1786

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1787

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1788

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1789

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1790

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1791

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1792

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1793

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1794

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1795

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1796

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1797

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1798

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1799

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1800

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1801

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1802

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1803

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1804

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1805

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1806

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1807

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1808

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1809

*/

1810

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1811

TENSOR4D_DECLARATION(src),

1812

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1813

#if defined(HAS_BIAS)

1814

VECTOR_DECLARATION(bias),

1815

#endif // defined(HAS_BIAS)

1816

int dst_size)

1817

{

1818

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1825

src_stride_w,

1826

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1827

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1835

dst_stride_w,

1836

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1837

dst_offset_first_element_in_bytes,

1838

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1843

#endif // defined(HAS_BIAS)

1844

dst_size);

1845

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1846

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1847

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas