Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

34

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

35

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

36

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

37

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

38

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

39

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

40

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

41

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

42

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

43

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

44

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

45

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

46

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

47

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

48

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

49

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

50

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

51

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

52

*/

53

__kernel void winograd_output_transform_2x2_3x3_nchw(

54

TENSOR3D_DECLARATION(src),

55

TENSOR3D_DECLARATION(dst)

56

#if defined(HAS_BIAS)

57

,

58

VECTOR_DECLARATION(bias)

59

#endif // defined(HAS_BIAS)

60

)

61

{

62

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

63

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

64

65

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

66

67

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

68

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

69

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

70

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

71

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

72

73

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

74

// Compute the 2x1 or 1x2 output tile

75

// out00 = d00 + d01 + d02

76

// out01 = d01 - d02 - d03

77

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

78

DATA_TYPE out00 = d00 + d01 + d02;

79

DATA_TYPE out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

80

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

81

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

82

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

83

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

84

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

85

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

86

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

87

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

88

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

89

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

90

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

91

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

92

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

93

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

94

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

95

96

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

97

DATA_TYPE k0 = d01 + d11 + d21;

98

DATA_TYPE k1 = d02 + d12 + d22;

99

DATA_TYPE k2 = d11 - d21 - d31;

100

DATA_TYPE k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

101

102

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

103

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

104

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

105

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

106

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

107

DATA_TYPE out00 = d10;

108

DATA_TYPE out01 = -d13;

109

DATA_TYPE out10 = d10;

110

DATA_TYPE out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

111

112

out00 += d00 + d20 + k0 + k1;

113

out01 += k0 - k1 - (d03 + d23);

114

out10 += -d20 - d30 + k2 + k3;

115

out11 += k2 - k3 + d23 + d33;

116

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

117

118

int y_in = get_global_id(1);

119

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

120

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

121

int z_out = get_global_id(0);

122

123

#if defined(HAS_BIAS)

124

// Add bias

125

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

126

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

127

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

128

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

129

out00 += (DATA_TYPE)b;

130

out01 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

131

#endif // defined(HAS_BIAS)

132

133

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

134

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

135

136

// Store the output tile

137

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

138

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;

139

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

141

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

142

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

143

144

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

145

#if defined(HAS_BIAS)

146

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

147

out10 += (DATA_TYPE)b;

148

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

#endif // defined(HAS_BIAS)

150

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

151

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out10, out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

152

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

153

}

154

155

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

156

*

157

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

158

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

159

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

160

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

161

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

162

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

163

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

164

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

165

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

166

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

167

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

168

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

169

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

170

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

171

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

172

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

173

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

174

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

175

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

176

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

177

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

178

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

179

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

180

*/

181

__kernel void winograd_output_transform_4x4_3x3_nchw(

182

TENSOR3D_DECLARATION(src),

183

TENSOR3D_DECLARATION(dst)

184

#if defined(HAS_BIAS)

185

,

186

VECTOR_DECLARATION(bias)

187

#endif // defined(HAS_BIAS)

188

)

189

{

190

// Each thread stores a 4x4/4x1 or 1x4 tile

191

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

192

193

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

194

195

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

196

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

197

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

198

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

199

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

200

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

201

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

202

203

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

204

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

205

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;

206

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

207

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

208

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

209

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

210

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

211

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

212

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

213

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

214

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

215

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

216

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

217

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

218

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

219

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

220

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

221

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

222

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

223

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

224

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

225

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

226

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

227

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

228

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

229

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

230

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

231

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

232

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

233

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

234

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

235

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

236

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

237

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

238

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

239

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

240

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

241

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

242

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

243

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

244

245

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

246

DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;

247

DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;

248

DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;

249

DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

250

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

251

DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

252

DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

253

254

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

255

out01 += k1 - d02 - d12 - d22 - d32 - d42;

256

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

257

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

258

259

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

260

DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

261

DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

262

DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

263

DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

264

265

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

266

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

267

268

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

269

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

270

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

271

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

272

273

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

274

DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

275

DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

276

DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

277

DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

278

279

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

280

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

281

282

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

283

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

284

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

285

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

286

287

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

288

DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

289

DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

290

DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

291

DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

292

293

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

294

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

295

296

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

297

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

298

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

299

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

300

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

301

302

int y_in = get_global_id(1);

303

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

304

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

305

int z_out = get_global_id(0);

306

307

#if defined(HAS_BIAS)

308

// Add bias

309

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

310

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

311

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

312

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

313

out00 += (DATA_TYPE)b;

314

out01 += (DATA_TYPE)b;

315

out02 += (DATA_TYPE)b;

316

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

317

#endif // defined(HAS_BIAS)

318

319

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

320

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

321

322

// Store the output tile

323

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

324

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;

325

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;

326

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;

327

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

328

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

329

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

330

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

331

332

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

333

#if defined(HAS_BIAS)

334

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

335

out10 += (DATA_TYPE)b;

336

out11 += (DATA_TYPE)b;

337

out12 += (DATA_TYPE)b;

338

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

339

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

340

out20 += (DATA_TYPE)b;

341

out21 += (DATA_TYPE)b;

342

out22 += (DATA_TYPE)b;

343

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

344

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

345

out30 += (DATA_TYPE)b;

346

out31 += (DATA_TYPE)b;

347

out32 += (DATA_TYPE)b;

348

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

349

#endif // defined(HAS_BIAS)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

350

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out10, out11, out12, out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

351

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out20, out21, out22, out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

352

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out30, out31, out32, out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

353

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

354

}

355

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

356

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

357

*

358

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

359

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

360

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

361

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

362

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

363

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

364

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

365

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

366

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

367

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

368

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

369

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

370

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

371

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

372

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

373

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

374

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

375

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

376

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

377

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

378

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

379

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

380

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

381

* @param[in] dst_size Size of the destination tensor, minus the last padding

382

*/

383

__kernel void winograd_output_transform_4x4_3x3_nhwc(

384

TENSOR3D_DECLARATION(src),

385

TENSOR3D_DECLARATION(dst),

386

#if defined(HAS_BIAS)

387

VECTOR_DECLARATION(bias),

388

#endif // defined(HAS_BIAS)

389

int dst_size)

390

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

391

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

392

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

393

394

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

395

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

396

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

397

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

398

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

399

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

400

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

401

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

402

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

403

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

404

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

405

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

406

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;

407

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

408

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

409

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

410

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

411

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

412

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

413

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

414

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

415

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

416

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

417

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

418

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

419

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

420

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

421

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

422

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

423

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

424

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

425

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

426

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

427

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

428

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

429

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

430

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

431

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

432

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

433

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

434

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

435

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

436

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

437

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

438

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

439

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

440

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

441

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

442

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

443

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

444

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

445

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

446

447

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

448

DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;

449

DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;

450

DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;

451

DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

452

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

453

DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

454

DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

455

456

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

457

out01 += k1 - d02 - d12 - d22 - d32 - d42;

458

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

459

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

460

461

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

462

DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

463

DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

464

DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

465

DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

466

467

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

468

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

469

470

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

471

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

472

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

473

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

474

475

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

476

DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

477

DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

478

DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

479

DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

480

481

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

482

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

483

484

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

485

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

486

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

487

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

488

489

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

490

DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

491

DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

492

DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

493

DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

494

495

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

496

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

497

498

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

499

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

500

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

501

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

502

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

503

504

int y_in = get_global_id(1);

505

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

506

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

507

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

508

509

#if defined(HAS_BIAS)

510

// Add bias

511

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

512

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

513

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

514

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

515

out00 += (DATA_TYPE)b;

516

out01 += (DATA_TYPE)b;

517

out02 += (DATA_TYPE)b;

518

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

519

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

520

out10 += (DATA_TYPE)b;

521

out11 += (DATA_TYPE)b;

522

out12 += (DATA_TYPE)b;

523

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

524

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

525

out20 += (DATA_TYPE)b;

526

out21 += (DATA_TYPE)b;

527

out22 += (DATA_TYPE)b;

528

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

529

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

530

out30 += (DATA_TYPE)b;

531

out31 += (DATA_TYPE)b;

532

out32 += (DATA_TYPE)b;

533

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

534

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

535

536

#endif // defined(HAS_BIAS)

537

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

538

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

539

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Georgios Pinitas

2018-07-20 17:30:56 +0100

[diff] [blame]

540

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

541

542

// Store the 1x4 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

543

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out00;

544

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out01;

545

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out02;

546

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out03;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

547

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

548

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

549

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

550

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

551

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

552

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;

553

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;

554

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;

555

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

556

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

557

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

558

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Georgios Pinitas

2018-07-20 17:30:56 +0100

[diff] [blame]

559

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

560

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

561

562

// Store the 4x4 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

563

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;

564

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;

565

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;

566

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;

567

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;

568

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;

569

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;

570

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;

571

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;

572

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;

573

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;

574

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;

575

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;

576

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;

577

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;

578

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

579

580

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

581

}

582

583

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

584

({ \

585

comm_fact.s0 = d1 + d2; \

586

comm_fact.s1 = d3 + d4; \

587

comm_fact.s2 = d5 + d6; \

588

\

589

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

590

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

591

\

592

comm_fact.s0 = d1 - d2; \

593

comm_fact.s1 = d3 - d4; \

594

comm_fact.s2 = d5 - d6; \

595

\

596

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

597

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

598

})

599

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

600

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

601

*

602

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

603

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

604

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

605

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

606

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

607

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

608

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

609

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

610

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

611

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

612

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

613

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

614

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

615

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

616

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

617

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

618

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

619

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

620

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

621

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

622

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

623

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

624

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

625

*/

626

__kernel void winograd_output_transform_4x4_5x5_nchw(

627

TENSOR3D_DECLARATION(src),

628

TENSOR3D_DECLARATION(dst)

629

#if defined(HAS_BIAS)

630

,

631

VECTOR_DECLARATION(bias)

632

#endif // defined(HAS_BIAS)

633

)

634

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

635

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

636

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

637

638

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

639

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

640

// Compute output address

641

int y_in = get_global_id(1);

642

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

643

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

644

int z_out = get_global_id(0);

645

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

646

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

647

648

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

649

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

650

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

651

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

652

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

653

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

654

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

655

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

656

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

657

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

658

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

659

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

660

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

661

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

662

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

663

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

664

665

#if defined(HAS_BIAS)

666

// Add bias

667

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

668

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

669

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

670

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

671

out00 += (DATA_TYPE)b;

672

out01 += (DATA_TYPE)b;

673

out02 += (DATA_TYPE)b;

674

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

675

#endif // defined(HAS_BIAS)

676

677

// Store the output tile

678

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

679

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;

680

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;

681

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;

682

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

683

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

684

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

685

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

686

687

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

688

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

689

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

690

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

691

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

692

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

693

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

694

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

695

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

696

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

697

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

698

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

699

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

700

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

701

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

702

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

703

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

704

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

705

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

706

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

707

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

708

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

709

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

710

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

711

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

712

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

713

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

714

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

715

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

716

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

717

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

718

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

719

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

720

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

721

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

722

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

723

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

724

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

725

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

726

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

727

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

728

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

729

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

730

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

731

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

732

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

733

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

734

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

735

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

736

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

737

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

738

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

739

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

740

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

741

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

742

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

743

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

744

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

745

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

746

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

747

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

748

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

749

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

750

751

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

752

VEC_DATA_TYPE(DATA_TYPE, 4)

753

comm_fact0, comm_fact1, comm_fact2;

754

VEC_DATA_TYPE(DATA_TYPE, 4)

755

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

756

757

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

758

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

759

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

760

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

761

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

762

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

763

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

764

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

765

766

// Compute the 4x4 output tile

767

comm_fact0 = tmp_col1 + tmp_col2;

768

comm_fact1 = tmp_col3 + tmp_col4;

769

comm_fact2 = tmp_col5 + tmp_col6;

770

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

771

VEC_DATA_TYPE(DATA_TYPE, 4)

772

out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;

773

VEC_DATA_TYPE(DATA_TYPE, 4)

774

out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

775

776

comm_fact0 = tmp_col1 - tmp_col2;

777

comm_fact1 = tmp_col3 - tmp_col4;

778

comm_fact2 = tmp_col5 - tmp_col6;

779

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

780

VEC_DATA_TYPE(DATA_TYPE, 4)

781

out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;

782

VEC_DATA_TYPE(DATA_TYPE, 4)

783

out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

784

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

785

#if defined(HAS_BIAS)

786

// Add bias

787

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

788

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

789

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

790

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

791

out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

792

out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

793

out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

794

out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

795

#endif // defined(HAS_BIAS)

796

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

797

// Store the output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

798

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

799

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

800

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

801

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

802

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

803

}

804

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

805

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

806

*

807

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

808

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

809

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

810

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

811

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

812

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

813

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

814

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

815

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

816

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

817

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

818

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

819

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

820

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

821

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

822

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

823

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

824

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

825

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

826

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

827

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

828

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

829

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

830

*/

831

__kernel void winograd_output_transform_4x4_5x5_nhwc(

832

TENSOR3D_DECLARATION(src),

833

TENSOR3D_DECLARATION(dst),

834

#if defined(HAS_BIAS)

835

VECTOR_DECLARATION(bias),

836

#endif // defined(HAS_BIAS)

837

int dst_size)

838

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

839

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

840

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

841

842

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

843

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

844

int y_in = get_global_id(1);

845

int x_out = get_global_id(0);

846

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

847

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

848

849

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

850

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

851

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

852

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

853

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

854

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

855

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

856

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

857

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

858

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

859

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

860

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

861

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

862

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

863

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

864

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

865

866

#if defined(HAS_BIAS)

867

// Add bias

868

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

869

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

870

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

871

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

872

out00 += (DATA_TYPE)b;

873

out01 += (DATA_TYPE)b;

874

out02 += (DATA_TYPE)b;

875

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

876

#endif // defined(HAS_BIAS)

877

878

// Store the output tile

879

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

880

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

881

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Georgios Pinitas

2018-07-20 17:30:56 +0100

[diff] [blame]

882

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

883

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

884

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out00;

885

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out01;

886

*(__global DATA_TYPE *)(dst_ptr + offset.s2) = out02;

887

*(__global DATA_TYPE *)(dst_ptr + offset.s3) = out03;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

888

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

889

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

890

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

891

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

892

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out00;

893

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out01;

894

*(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out02;

895

*(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out03;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

896

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

897

898

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

899

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

900

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

901

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

902

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

903

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

904

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

905

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

906

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

907

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

908

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

909

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

910

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

911

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

912

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

913

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

914

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

915

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

916

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

917

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

918

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

919

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

920

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

921

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

922

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

923

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

924

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

925

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

926

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

927

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

928

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

929

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

930

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

931

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

932

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

933

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

934

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

935

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

936

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

937

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

938

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

939

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

940

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

941

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

942

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

943

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

944

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

945

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

946

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

947

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

948

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

949

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

950

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

951

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

952

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

953

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

954

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

955

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

956

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

957

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

958

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

959

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

960

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

961

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

962

963

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

964

VEC_DATA_TYPE(DATA_TYPE, 4)

965

comm_fact0, comm_fact1, comm_fact2;

966

VEC_DATA_TYPE(DATA_TYPE, 4)

967

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

968

969

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

970

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

971

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

972

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

973

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

974

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

975

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

976

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

977

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

978

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

979

comm_fact0 = tmp_col1 + tmp_col2;

980

comm_fact1 = tmp_col3 + tmp_col4;

981

comm_fact2 = tmp_col5 + tmp_col6;

982

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

983

VEC_DATA_TYPE(DATA_TYPE, 4)

984

out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;

985

VEC_DATA_TYPE(DATA_TYPE, 4)

986

out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

987

988

comm_fact0 = tmp_col1 - tmp_col2;

989

comm_fact1 = tmp_col3 - tmp_col4;

990

comm_fact2 = tmp_col5 - tmp_col6;

991

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

992

VEC_DATA_TYPE(DATA_TYPE, 4)

993

out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;

994

VEC_DATA_TYPE(DATA_TYPE, 4)

995

out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

996

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

997

#if defined(HAS_BIAS)

998

// Add bias

999

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1000

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1001

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1002

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1003

out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

1004

out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

1005

out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

1006

out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1007

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1008

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1009

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Georgios Pinitas

2018-07-20 17:30:56 +0100

[diff] [blame]

1010

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1011

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1012

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1013

// Store the output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1014

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;

1015

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;

1016

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;

1017

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;

1018

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;

1019

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;

1020

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;

1021

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;

1022

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;

1023

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;

1024

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;

1025

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;

1026

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;

1027

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;

1028

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;

1029

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1030

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1031

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1032

1033

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1034

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1035

*

1036

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1037

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1038

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1039

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1040

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1041

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1042

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1043

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1044

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1045

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1046

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1047

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1048

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1049

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1050

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1051

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1052

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1053

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1054

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1055

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1056

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1057

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1058

*/

1059

__kernel void winograd_output_transform_2x1_3x1_nchw(

1060

TENSOR3D_DECLARATION(src),

1061

TENSOR3D_DECLARATION(dst)

1062

#if defined(HAS_BIAS)

1063

,

1064

VECTOR_DECLARATION(bias)

1065

#endif // defined(HAS_BIAS)

1066

)

1067

{

1068

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1084

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1090

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1095

*

1096

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1097

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1098

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1099

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1100

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1101

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1102

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1103

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1104

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1105

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1106

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1107

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1108

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1109

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1110

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1111

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1112

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1113

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1114

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1115

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1116

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1117

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1118

*/

1119

__kernel void winograd_output_transform_4x1_3x1_nchw(

1120

TENSOR3D_DECLARATION(src),

1121

TENSOR3D_DECLARATION(dst)

1122

#if defined(HAS_BIAS)

1123

,

1124

VECTOR_DECLARATION(bias)

1125

#endif // defined(HAS_BIAS)

1126

)

1127

{

1128

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1144

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1150

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1155

*

1156

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1157

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1158

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1159

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1160

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1161

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1162

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1163

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1164

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1165

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1166

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1167

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1168

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1169

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1170

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1171

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1172

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1173

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1174

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1175

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1176

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1177

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1178

*/

1179

__kernel void winograd_output_transform_4x1_5x1_nchw(

1180

TENSOR3D_DECLARATION(src),

1181

TENSOR3D_DECLARATION(dst)

1182

#if defined(HAS_BIAS)

1183

,

1184

VECTOR_DECLARATION(bias)

1185

#endif // defined(HAS_BIAS)

1186

)

1187

{

1188

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1204

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1210

#endif // defined(HAS_BIAS)

1211

);

1212

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1213

1214

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1215

*

1216

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1217

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1218

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1219

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1220

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1221

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1222

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1223

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1224

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1225

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1226

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1227

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1228

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1229

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1230

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1231

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1232

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1233

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1234

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1235

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1236

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1237

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1238

*/

1239

__kernel void winograd_output_transform_4x1_3x1_nhwc(

1240

TENSOR3D_DECLARATION(src),

1241

TENSOR3D_DECLARATION(dst),

1242

#if defined(HAS_BIAS)

1243

VECTOR_DECLARATION(bias),

1244

#endif // defined(HAS_BIAS)

1245

int dst_size)

1246

{

1247

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1263

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1268

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1273

*

1274

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1275

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1276

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1277

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1278

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1279

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1280

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1281

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1282

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1283

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1284

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1285

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1286

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1287

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1288

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1289

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1290

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1291

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1292

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1293

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1294

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1295

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1296

*/

1297

__kernel void winograd_output_transform_4x1_5x1_nhwc(

1298

TENSOR3D_DECLARATION(src),

1299

TENSOR3D_DECLARATION(dst),

1300

#if defined(HAS_BIAS)

1301

VECTOR_DECLARATION(bias),

1302

#endif // defined(HAS_BIAS)

1303

int dst_size)

1304

{

1305

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1321

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1326

#endif // defined(HAS_BIAS)

1327

dst_size);

1328

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1329

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1330

1331

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1332

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1333

*

1334

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1335

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1336

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1337

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1338

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1339

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1340

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1341

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1342

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1343

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1344

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1345

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1346

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1347

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1348

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1349

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1350

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1351

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1352

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1353

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1354

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1355

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1356

*/

1357

__kernel void winograd_output_transform_1x2_1x3_nchw(

1358

TENSOR3D_DECLARATION(src),

1359

TENSOR3D_DECLARATION(dst)

1360

#if defined(HAS_BIAS)

1361

,

1362

VECTOR_DECLARATION(bias)

1363

#endif // defined(HAS_BIAS)

1364

)

1365

{

1366

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1382

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1388

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1393

*

1394

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1395

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1396

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1397

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1398

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1399

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1400

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1401

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1402

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1403

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1404

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1405

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1406

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1407

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1408

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1409

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1410

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1411

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1412

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1413

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1414

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1415

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1416

*/

1417

__kernel void winograd_output_transform_1x4_1x3_nchw(

1418

TENSOR3D_DECLARATION(src),

1419

TENSOR3D_DECLARATION(dst)

1420

#if defined(HAS_BIAS)

1421

,

1422

VECTOR_DECLARATION(bias)

1423

#endif // defined(HAS_BIAS)

1424

)

1425

{

1426

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1442

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1448

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1453

*

1454

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1455

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1456

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1457

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1458

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1459

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1460

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1461

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1462

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1463

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1464

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1465

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1466

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1467

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1468

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1469

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1470

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1471

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1472

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1473

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1474

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1475

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1476

*/

1477

__kernel void winograd_output_transform_1x4_1x5_nchw(

1478

TENSOR3D_DECLARATION(src),

1479

TENSOR3D_DECLARATION(dst)

1480

#if defined(HAS_BIAS)

1481

,

1482

VECTOR_DECLARATION(bias)

1483

#endif // defined(HAS_BIAS)

1484

)

1485

{

1486

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1502

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1508

#endif // defined(HAS_BIAS)

1509

);

1510

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1511

1512

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

1513

*

1514

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1515

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1516

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1517

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1518

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1519

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1520

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1521

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1522

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1523

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1524

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1525

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1526

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1527

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1528

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1529

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1530

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1531

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1532

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1533

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1534

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1535

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1536

*/

1537

__kernel void winograd_output_transform_1x4_1x3_nhwc(

1538

TENSOR3D_DECLARATION(src),

1539

TENSOR3D_DECLARATION(dst),

1540

#if defined(HAS_BIAS)

1541

VECTOR_DECLARATION(bias),

1542

#endif // defined(HAS_BIAS)

1543

int dst_size)

1544

{

1545

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1561

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1566

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1571

*

1572

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1573

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1574

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1575

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1576

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1577

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1578

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1579

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1580

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1581

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1582

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1583

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1584

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1585

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1586

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1587

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1588

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1589

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1590

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1591

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1592

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1593

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1594

*/

1595

__kernel void winograd_output_transform_1x4_1x5_nhwc(

1596

TENSOR3D_DECLARATION(src),

1597

TENSOR3D_DECLARATION(dst),

1598

#if defined(HAS_BIAS)

1599

VECTOR_DECLARATION(bias),

1600

#endif // defined(HAS_BIAS)

1601

int dst_size)

1602

{

1603

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1619

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1624

#endif // defined(HAS_BIAS)

1625

dst_size);

1626

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1627

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena