Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

34

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

35

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

36

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

37

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

38

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

39

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

40

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

41

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

42

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

43

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

44

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

45

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

46

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

47

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

48

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

49

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

50

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

51

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

52

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

53

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

54

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

55

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

56

*/

57

__kernel void winograd_output_transform_2x2_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bias)
#endif // defined(HAS_BIAS)
)
{
    // One work-item produces one output tile: 2x2 by default, or two values when
    // WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL / WINOGRAD_OUTPUT_TRANSFORM_VERTICAL is defined
#if defined(SRC_DEPTH)
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else  /* defined(SRC_DEPTH) */
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */

    // Element k of the transformed input tile is read k * src_stride_z bytes after src_addr
#define WINOGRAD_2X2_SRC_AT(k) (*((__global DATA_TYPE *)(src_addr + (k) * src_stride_z)))

    // First row of the input tile (the only row needed by the 1-D variants)
    const DATA_TYPE d00 = WINOGRAD_2X2_SRC_AT(0);
    const DATA_TYPE d01 = WINOGRAD_2X2_SRC_AT(1);
    const DATA_TYPE d02 = WINOGRAD_2X2_SRC_AT(2);
    const DATA_TYPE d03 = WINOGRAD_2X2_SRC_AT(3);

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // 1-D transform: two outputs from four inputs
    DATA_TYPE out00 = d00 + d01 + d02;
    DATA_TYPE out01 = d01 - d02 - d03;
#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Remaining three rows of the 4x4 input tile
    const DATA_TYPE d10 = WINOGRAD_2X2_SRC_AT(4);
    const DATA_TYPE d11 = WINOGRAD_2X2_SRC_AT(5);
    const DATA_TYPE d12 = WINOGRAD_2X2_SRC_AT(6);
    const DATA_TYPE d13 = WINOGRAD_2X2_SRC_AT(7);

    const DATA_TYPE d20 = WINOGRAD_2X2_SRC_AT(8);
    const DATA_TYPE d21 = WINOGRAD_2X2_SRC_AT(9);
    const DATA_TYPE d22 = WINOGRAD_2X2_SRC_AT(10);
    const DATA_TYPE d23 = WINOGRAD_2X2_SRC_AT(11);

    const DATA_TYPE d30 = WINOGRAD_2X2_SRC_AT(12);
    const DATA_TYPE d31 = WINOGRAD_2X2_SRC_AT(13);
    const DATA_TYPE d32 = WINOGRAD_2X2_SRC_AT(14);
    const DATA_TYPE d33 = WINOGRAD_2X2_SRC_AT(15);

    // Reductions of columns 1 and 2 that are shared between the outputs:
    //   *_top uses rows 0..2 with weights (+1, +1, +1)
    //   *_bot uses rows 1..3 with weights (+1, -1, -1)
    const DATA_TYPE col1_top = d01 + d11 + d21;
    const DATA_TYPE col2_top = d02 + d12 + d22;
    const DATA_TYPE col1_bot = d11 - d21 - d31;
    const DATA_TYPE col2_bot = d12 - d22 - d32;

    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
    // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
    DATA_TYPE out00 = d10 + (d00 + d20 + col1_top + col2_top);
    DATA_TYPE out01 = -d13 + (col1_top - col2_top - (d03 + d23));
    DATA_TYPE out10 = d10 + (-d20 - d30 + col1_bot + col2_bot);
    DATA_TYPE out11 = -d13 + (col1_bot - col2_bot + d23 + d33);
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

#undef WINOGRAD_2X2_SRC_AT

    // Global id 1 enumerates the tiles row by row (NUM_TILES_X tiles per row);
    // global id 0 selects the destination Z plane and the bias element
    const int tile_idx = get_global_id(1);
    const int x_out    = (tile_idx % NUM_TILES_X) * OUTPUT_TILE_W;
    const int y_out    = (tile_idx / NUM_TILES_X) * OUTPUT_TILE_H;
    const int z_out    = get_global_id(0);
#if defined(SRC_DEPTH)
    const int batch = get_global_id(2) / SRC_DEPTH;
#endif /* defined(SRC_DEPTH) */

#if defined(HAS_BIAS)
    // The same bias value (indexed by z_out) is added to every element of the tile
    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

    const DATA_TYPE bias_val = *((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

    out00 += bias_val;
    out01 += bias_val;
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    out10 += bias_val;
    out11 += bias_val;
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(HAS_BIAS)

    // Address of the first element of the output tile
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
#if defined(SRC_DEPTH)
    dst_addr += batch * dst_stride_w;
#endif /* defined(SRC_DEPTH) */

    // Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Vertical variant: the two outputs go to consecutive rows
    *((__global DATA_TYPE *)(dst_addr)) = out00;
    *((__global DATA_TYPE *)(dst_addr + dst_stride_y)) = out01;
#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // First (or only) row: two values contiguous along X
    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr));
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
    // Full 2-D transform: second row of the tile
    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out10, out11), 0, (__global DATA_TYPE *)(dst_addr + dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}

169

170

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NCHW

171

*

172

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

173

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

174

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

175

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

176

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

177

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

178

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

179

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

180

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

181

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

182

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

183

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

184

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

185

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

186

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

187

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

188

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

189

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

190

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

191

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

192

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

193

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

194

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

195

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

196

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

197

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

198

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

199

*/

200

__kernel void winograd_output_transform_4x4_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bias)
#endif // defined(HAS_BIAS)
)
{
    // One work-item produces one output tile: 4x4 by default, or four values when
    // WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL / WINOGRAD_OUTPUT_TRANSFORM_VERTICAL is defined
#if defined(SRC_DEPTH)
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else  /* defined(SRC_DEPTH) */
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */

    // Element k of the transformed input tile is read k * src_stride_z bytes after src_addr
#define WINOGRAD_4X4_SRC_AT(k) (*((__global DATA_TYPE *)(src_addr + (k) * src_stride_z)))

    // First row of the input tile (the only row needed by the 1-D variants)
    const DATA_TYPE d00 = WINOGRAD_4X4_SRC_AT(0);
    const DATA_TYPE d01 = WINOGRAD_4X4_SRC_AT(1);
    const DATA_TYPE d02 = WINOGRAD_4X4_SRC_AT(2);
    const DATA_TYPE d03 = WINOGRAD_4X4_SRC_AT(3);
    const DATA_TYPE d04 = WINOGRAD_4X4_SRC_AT(4);
    const DATA_TYPE d05 = WINOGRAD_4X4_SRC_AT(5);

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // 1-D transform: four outputs from six inputs
    DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;
    DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
    DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
    DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Remaining five rows of the 6x6 input tile
    const DATA_TYPE d10 = WINOGRAD_4X4_SRC_AT(6);
    const DATA_TYPE d11 = WINOGRAD_4X4_SRC_AT(7);
    const DATA_TYPE d12 = WINOGRAD_4X4_SRC_AT(8);
    const DATA_TYPE d13 = WINOGRAD_4X4_SRC_AT(9);
    const DATA_TYPE d14 = WINOGRAD_4X4_SRC_AT(10);
    const DATA_TYPE d15 = WINOGRAD_4X4_SRC_AT(11);

    const DATA_TYPE d20 = WINOGRAD_4X4_SRC_AT(12);
    const DATA_TYPE d21 = WINOGRAD_4X4_SRC_AT(13);
    const DATA_TYPE d22 = WINOGRAD_4X4_SRC_AT(14);
    const DATA_TYPE d23 = WINOGRAD_4X4_SRC_AT(15);
    const DATA_TYPE d24 = WINOGRAD_4X4_SRC_AT(16);
    const DATA_TYPE d25 = WINOGRAD_4X4_SRC_AT(17);

    const DATA_TYPE d30 = WINOGRAD_4X4_SRC_AT(18);
    const DATA_TYPE d31 = WINOGRAD_4X4_SRC_AT(19);
    const DATA_TYPE d32 = WINOGRAD_4X4_SRC_AT(20);
    const DATA_TYPE d33 = WINOGRAD_4X4_SRC_AT(21);
    const DATA_TYPE d34 = WINOGRAD_4X4_SRC_AT(22);
    const DATA_TYPE d35 = WINOGRAD_4X4_SRC_AT(23);

    const DATA_TYPE d40 = WINOGRAD_4X4_SRC_AT(24);
    const DATA_TYPE d41 = WINOGRAD_4X4_SRC_AT(25);
    const DATA_TYPE d42 = WINOGRAD_4X4_SRC_AT(26);
    const DATA_TYPE d43 = WINOGRAD_4X4_SRC_AT(27);
    const DATA_TYPE d44 = WINOGRAD_4X4_SRC_AT(28);
    const DATA_TYPE d45 = WINOGRAD_4X4_SRC_AT(29);

    const DATA_TYPE d50 = WINOGRAD_4X4_SRC_AT(30);
    const DATA_TYPE d51 = WINOGRAD_4X4_SRC_AT(31);
    const DATA_TYPE d52 = WINOGRAD_4X4_SRC_AT(32);
    const DATA_TYPE d53 = WINOGRAD_4X4_SRC_AT(33);
    const DATA_TYPE d54 = WINOGRAD_4X4_SRC_AT(34);
    const DATA_TYPE d55 = WINOGRAD_4X4_SRC_AT(35);

    // For every output row N three shared terms are built first:
    //   col1_rN   : column 1 reduced along the rows with the row-N weights (seeds all four outputs)
    //   sum34_rN  : (column 3 + column 4) reduced with the same row weights
    //   diff34_rN : 2 * (column 3 - column 4) reduced with the same row weights
    // The order of the floating point operations is kept exactly as written.

    // ---- Output row 0: out00, out01, out02 and out03 ----
    const DATA_TYPE col1_r0 = d01 + d21 + d41 + d11 + d31;

    DATA_TYPE out00 = col1_r0;
    DATA_TYPE out01 = col1_r0;
    DATA_TYPE out02 = col1_r0;
    DATA_TYPE out03 = col1_r0;

    const DATA_TYPE sum34_r0  = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
    const DATA_TYPE diff34_r0 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

    out00 += sum34_r0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
    out01 += diff34_r0 - d02 - d12 - d22 - d32 - d42;
    out02 += 4.0f * sum34_r0 + d02 + d12 + d22 + d32 + d42;
    out03 += 4.0f * diff34_r0 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

    // ---- Output row 1: out10, out11, out12 and out13 ----
    const DATA_TYPE col1_r1 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

    DATA_TYPE out10 = col1_r1;
    DATA_TYPE out11 = col1_r1;
    DATA_TYPE out12 = col1_r1;
    DATA_TYPE out13 = col1_r1;

    const DATA_TYPE sum34_r1  = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
    const DATA_TYPE diff34_r1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

    out10 += sum34_r1 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
    out11 += diff34_r1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
    out12 += 4.0f * sum34_r1 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
    out13 += 4.0f * diff34_r1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

    // ---- Output row 2: out20, out21, out22 and out23 ----
    const DATA_TYPE col1_r2 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

    DATA_TYPE out20 = col1_r2;
    DATA_TYPE out21 = col1_r2;
    DATA_TYPE out22 = col1_r2;
    DATA_TYPE out23 = col1_r2;

    const DATA_TYPE sum34_r2  = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
    const DATA_TYPE diff34_r2 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

    out20 += sum34_r2 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
    out21 += diff34_r2 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
    out22 += 4.0f * sum34_r2 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
    out23 += 4.0f * diff34_r2 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

    // ---- Output row 3: out30, out31, out32 and out33 ----
    const DATA_TYPE col1_r3 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

    DATA_TYPE out30 = col1_r3;
    DATA_TYPE out31 = col1_r3;
    DATA_TYPE out32 = col1_r3;
    DATA_TYPE out33 = col1_r3;

    const DATA_TYPE sum34_r3  = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
    const DATA_TYPE diff34_r3 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

    out30 += sum34_r3 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
    out31 += diff34_r3 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
    out32 += 4.0f * sum34_r3 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
    out33 += 4.0f * diff34_r3 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

#undef WINOGRAD_4X4_SRC_AT

    // Global id 1 enumerates the tiles row by row (NUM_TILES_X tiles per row);
    // global id 0 selects the destination Z plane and the bias element
    const int tile_idx = get_global_id(1);
    const int x_out    = (tile_idx % NUM_TILES_X) * OUTPUT_TILE_W;
    const int y_out    = (tile_idx / NUM_TILES_X) * OUTPUT_TILE_H;
    const int z_out    = get_global_id(0);
#if defined(SRC_DEPTH)
    const int batch = get_global_id(2) / SRC_DEPTH;
#endif /* defined(SRC_DEPTH) */

#if defined(HAS_BIAS)
    // The same bias value (indexed by z_out) is added to every element of the tile
    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

    const DATA_TYPE bias_val = *((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

    out00 += bias_val;
    out01 += bias_val;
    out02 += bias_val;
    out03 += bias_val;
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    out10 += bias_val;
    out11 += bias_val;
    out12 += bias_val;
    out13 += bias_val;

    out20 += bias_val;
    out21 += bias_val;
    out22 += bias_val;
    out23 += bias_val;

    out30 += bias_val;
    out31 += bias_val;
    out32 += bias_val;
    out33 += bias_val;
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(HAS_BIAS)

    // Address of the first element of the output tile
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
#if defined(SRC_DEPTH)
    dst_addr += batch * dst_stride_w;
#endif /* defined(SRC_DEPTH) */

    // Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Vertical variant: the four outputs go to consecutive rows
    *((__global DATA_TYPE *)(dst_addr)) = out00;
    *((__global DATA_TYPE *)(dst_addr + dst_stride_y)) = out01;
    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // First (or only) row: four values contiguous along X
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
    // Full 2-D transform: rows 1..3 of the tile
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out10, out11, out12, out13), 0, (__global DATA_TYPE *)(dst_addr + dst_stride_y));
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out20, out21, out22, out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out30, out31, out32, out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}

385

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

386

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

387

*

388

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

389

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

390

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

391

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

392

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

393

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

394

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

395

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

396

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

397

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

398

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

399

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

400

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

401

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

402

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

403

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

404

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

405

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

406

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

407

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

408

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

409

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

410

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

411

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

412

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

413

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

414

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

415

* @param[in] dst_size Size of the destination tensor, minus the last padding

416

*/

417

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

418

TENSOR4D_DECLARATION(src),

419

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

420

#if defined(HAS_BIAS)

421

VECTOR_DECLARATION(bias),

422

#endif // defined(HAS_BIAS)

423

int dst_size)

424

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

425

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

426

#if defined(SRC_DEPTH)

427

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

428

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

429

#else /* defined(SRC_DEPTH) */

430

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

431

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

432

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

433

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

434

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

435

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

436

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

437

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

438

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

439

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

440

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

441

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

442

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

443

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

444

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;

445

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

446

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

447

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

448

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

449

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

450

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

451

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

452

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

453

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

454

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

455

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

456

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

457

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

458

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

459

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

460

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

461

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

462

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

463

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

464

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

465

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

466

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

467

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

468

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

469

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

470

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

471

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

472

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

473

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

474

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

475

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

476

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

477

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

478

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

479

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

480

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

481

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

482

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

483

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

484

485

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

486

DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;

487

DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;

488

DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;

489

DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

490

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

491

DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

492

DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

493

494

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

495

out01 += k1 - d02 - d12 - d22 - d32 - d42;

496

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

497

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

498

499

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

500

DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

501

DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

502

DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

503

DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

504

505

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

506

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

507

508

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

509

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

510

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

511

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

512

513

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

514

DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

515

DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

516

DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

517

DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

518

519

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

520

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

521

522

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

523

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

524

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

525

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

526

527

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

528

DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

529

DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

530

DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

531

DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

532

533

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

534

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

535

536

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

537

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

538

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

539

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

540

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

541

542

int y_in = get_global_id(1);

543

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

544

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

545

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

546

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

547

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

548

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

549

550

#if defined(HAS_BIAS)

551

// Add bias

552

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

553

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

554

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

555

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

556

out00 += (DATA_TYPE)b;

557

out01 += (DATA_TYPE)b;

558

out02 += (DATA_TYPE)b;

559

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

560

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

561

out10 += (DATA_TYPE)b;

562

out11 += (DATA_TYPE)b;

563

out12 += (DATA_TYPE)b;

564

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

565

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

566

out20 += (DATA_TYPE)b;

567

out21 += (DATA_TYPE)b;

568

out22 += (DATA_TYPE)b;

569

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

570

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

571

out30 += (DATA_TYPE)b;

572

out31 += (DATA_TYPE)b;

573

out32 += (DATA_TYPE)b;

574

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

575

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

576

577

#endif // defined(HAS_BIAS)

578

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

579

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

580

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

581

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

582

#else /* defined(SRC_DEPTH) */

583

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

584

#endif /* defined(SRC_DEPTH) */

585

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

586

587

// Store the 1x4 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

588

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out00;

589

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out01;

590

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out02;

591

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out03;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

592

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

593

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

594

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

595

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

596

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

597

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;

598

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;

599

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;

600

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

601

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

602

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

603

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

604

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

605

#else /* defined(SRC_DEPTH) */

606

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

607

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

608

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

609

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

610

611

// Store the 4x4 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

612

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;

613

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;

614

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;

615

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;

616

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;

617

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;

618

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;

619

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;

620

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;

621

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;

622

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;

623

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;

624

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;

625

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;

626

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;

627

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

628

629

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

630

}

631

632

/** Compute one 4-element column from 8 input values.
 *
 * Writes the result into col.s0 .. col.s3:
 *   col.s0 = (d1 + d2) +       (d3 + d4) + 8.f * (d5 + d6) + d0
 *   col.s1 = (d1 - d2) + 2.f * (d3 - d4) + 4.f * (d5 - d6)
 *   col.s2 = (d1 + d2) + 4.f * (d3 + d4) + 2.f * (d5 + d6)
 *   col.s3 = (d1 - d2) + 8.f * (d3 - d4) +       (d5 - d6) + d7
 *
 * comm_fact is used as scratch storage: its .s0/.s1/.s2 components are overwritten
 * (first with the pairwise sums, then with the pairwise differences, which is what
 * they hold on exit). Its .s3 component is not touched.
 *
 * Every argument is parenthesized in the expansion so that expression arguments
 * (e.g. "a + b" passed as d2) are evaluated as a unit. Each of d1..d6 is expanded
 * twice, so arguments with side effects must not be passed.
 *
 * @param[out]     col       Object with .s0 .. .s3 components receiving the result
 * @param[in]      d0 .. d7  The 8 input values
 * @param[in, out] comm_fact Scratch object with .s0 .. .s2 components
 */
#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact)                        \
    ({                                                                                         \
        (comm_fact).s0 = (d1) + (d2);                                                          \
        (comm_fact).s1 = (d3) + (d4);                                                          \
        (comm_fact).s2 = (d5) + (d6);                                                          \
                                                                                               \
        (col).s0 = (comm_fact).s0 + (comm_fact).s1 + 8.f * (comm_fact).s2 + (d0);              \
        (col).s2 = (comm_fact).s0 + 4.f * (comm_fact).s1 + 2.f * (comm_fact).s2;               \
                                                                                               \
        (comm_fact).s0 = (d1) - (d2);                                                          \
        (comm_fact).s1 = (d3) - (d4);                                                          \
        (comm_fact).s2 = (d5) - (d6);                                                          \
                                                                                               \
        (col).s1 = (comm_fact).s0 + 2.f * (comm_fact).s1 + 4.f * (comm_fact).s2;               \
        (col).s3 = (comm_fact).s0 + 8.f * (comm_fact).s1 + (comm_fact).s2 + (d7);              \
    })

648

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

649

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

650

*

651

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

652

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

653

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

654

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

655

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

656

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

657

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

658

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

659

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

660

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

662

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

663

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

664

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

665

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

666

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

667

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

668

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

669

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

670

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

671

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

672

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

673

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

674

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

675

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

676

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

677

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

678

*/

679

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

680

TENSOR4D_DECLARATION(src),

681

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

682

#if defined(HAS_BIAS)

683

,

684

VECTOR_DECLARATION(bias)

685

#endif // defined(HAS_BIAS)

686

)

687

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

688

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

689

#if defined(SRC_DEPTH)

690

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

691

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

692

#else /* defined(SRC_DEPTH) */

693

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

694

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

695

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

696

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

697

// Compute output address

698

int y_in = get_global_id(1);

699

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

700

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

701

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

702

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

703

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

704

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

705

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

706

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

707

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

708

#else /* defined(SRC_DEPTH) */

709

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

710

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

711

712

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

713

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

714

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

715

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

716

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

717

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

718

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

719

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

720

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

721

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

722

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

723

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

724

DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

725

DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

726

DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

727

DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

728

729

#if defined(HAS_BIAS)

730

// Add bias

731

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

732

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

733

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

734

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

735

out00 += (DATA_TYPE)b;

736

out01 += (DATA_TYPE)b;

737

out02 += (DATA_TYPE)b;

738

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

739

#endif // defined(HAS_BIAS)

740

741

// Store the output tile

742

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

743

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;

744

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;

745

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;

746

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

747

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

748

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

749

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

750

751

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

752

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

753

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

754

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

755

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

756

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

757

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

758

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

759

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

760

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

761

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

762

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

763

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

764

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

765

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

766

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

767

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

768

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

769

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

770

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

771

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

772

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

773

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

774

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

775

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

776

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

777

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

778

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

779

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

780

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

781

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

782

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

783

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

784

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

785

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

786

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

787

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

788

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

789

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

790

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

791

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

792

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

793

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

794

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

795

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

796

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

797

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

798

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

799

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

800

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

801

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

802

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

803

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

804

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

805

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

806

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

807

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

808

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

809

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

810

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

811

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

812

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

813

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

814

815

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

816

VEC_DATA_TYPE(DATA_TYPE, 4)

817

comm_fact0, comm_fact1, comm_fact2;

818

VEC_DATA_TYPE(DATA_TYPE, 4)

819

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

820

821

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

822

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

823

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

824

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

825

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

826

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

827

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

828

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

829

830

// Compute the 4x4 output tile

831

comm_fact0 = tmp_col1 + tmp_col2;

832

comm_fact1 = tmp_col3 + tmp_col4;

833

comm_fact2 = tmp_col5 + tmp_col6;

834

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

835

VEC_DATA_TYPE(DATA_TYPE, 4)

836

out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;

837

VEC_DATA_TYPE(DATA_TYPE, 4)

838

out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

839

840

comm_fact0 = tmp_col1 - tmp_col2;

841

comm_fact1 = tmp_col3 - tmp_col4;

842

comm_fact2 = tmp_col5 - tmp_col6;

843

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

844

VEC_DATA_TYPE(DATA_TYPE, 4)

845

out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;

846

VEC_DATA_TYPE(DATA_TYPE, 4)

847

out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

848

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

849

#if defined(HAS_BIAS)

850

// Add bias

851

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

852

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

853

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

854

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

855

out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

856

out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

857

out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

858

out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

859

#endif // defined(HAS_BIAS)

860

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

861

// Store the output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

862

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

863

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

864

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

865

vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

866

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

867

}

868

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

869

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 * @note If -DSRC_DEPTH is passed at compile time, the source is accessed as a 4D tensor and the batch index is computed as get_global_id(2) / SRC_DEPTH
 * @note If -DHAS_BIAS is passed at compile time, the bias value at index get_global_id(0) is added to every element of the output tile
 *
 * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[in]  bias_ptr                           (Only with -DHAS_BIAS) Pointer to the bias vector. Read with the same data type as DATA_TYPE
 * @param[in]  bias_stride_x                      (Only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Only with -DHAS_BIAS) The offset of the first element in the bias vector
 * @param[in]  dst_size                           Upper bound (in bytes) used to clamp the store offsets in the 1x4 (vertical) and 4x4 paths.
 *                                                NOTE(review): expected to point to the last padding of the destination tensor - confirm against the host-side kernel.
 */
__kernel void winograd_output_transform_4x4_5x5_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
    int dst_size)
{
    // Each thread stores a 4x4/4x1 or 1x4 tile
#if defined(SRC_DEPTH)
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else  /* defined(SRC_DEPTH) */
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */

    // Map the work-item to its output coordinates: global id 1 enumerates the tiles,
    // global id 0 is used directly as the X index of the destination (and as the bias index).
    int y_in  = get_global_id(1);
    int x_out = get_global_id(0);
    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
#if defined(SRC_DEPTH)
    int batch = get_global_id(2) / SRC_DEPTH;
#endif /* defined(SRC_DEPTH) */

    // Load the values across the channels to compose the input tile
    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // 1D case: only the first 8 values are needed
    // Compute out00, out01, out02 and out03
    DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
    DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
    DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
    DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

#if defined(HAS_BIAS)
    // Add bias
    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

    DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

    out00 += (DATA_TYPE)b;
    out01 += (DATA_TYPE)b;
    out02 += (DATA_TYPE)b;
    out03 += (DATA_TYPE)b;
#endif // defined(HAS_BIAS)

    // Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Get output address
#if defined(SRC_DEPTH)
    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
#else  /* defined(SRC_DEPTH) */
    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
#endif /* defined(SRC_DEPTH) */
    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out00;
    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out01;
    *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out02;
    *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out03;
#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
    // Get output address
    // The batch offset has to be applied here exactly as in the vertical and 2D paths;
    // without it every batch would be written onto the output of the first batch.
#if defined(SRC_DEPTH)
    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
#else  /* defined(SRC_DEPTH) */
    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
#endif /* defined(SRC_DEPTH) */

    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
    *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
    *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

    // 2D case: load the remaining 56 values of the 8x8 input tile
    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

    // Compute the 8x4 intermediate tensor
    VEC_DATA_TYPE(DATA_TYPE, 4)
    comm_fact0, comm_fact1, comm_fact2;
    VEC_DATA_TYPE(DATA_TYPE, 4)
    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

    COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
    COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
    COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
    COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
    COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
    COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
    COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
    COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

    // Compute the output tile
    // Sums of the column pairs feed output columns 0 and 2
    comm_fact0 = tmp_col1 + tmp_col2;
    comm_fact1 = tmp_col3 + tmp_col4;
    comm_fact2 = tmp_col5 + tmp_col6;

    VEC_DATA_TYPE(DATA_TYPE, 4)
    out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;

    // Differences of the column pairs feed output columns 1 and 3
    comm_fact0 = tmp_col1 - tmp_col2;
    comm_fact1 = tmp_col3 - tmp_col4;
    comm_fact2 = tmp_col5 - tmp_col6;

    VEC_DATA_TYPE(DATA_TYPE, 4)
    out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;

#if defined(HAS_BIAS)
    // Add bias
    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

    DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

    out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
    out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
    out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
    out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
#endif // defined(HAS_BIAS)
    // Get output address
#if defined(SRC_DEPTH)
    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
#else  /* defined(SRC_DEPTH) */
    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
#endif /* defined(SRC_DEPTH) */
    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
    int4 mult_y = min((int4)dst_size - offset, (int4)1);                            // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

    // Store the output tile
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;
    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1115

1116

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1117

/** Winograd output transform for a 2x1 output tile and a 3x1 filter, NCHW data layout.
 *
 * This kernel is a thin entry point: it forwards all of its arguments, unchanged, to
 * winograd_output_transform_2x2_3x3_nchw.
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_output_transform_2x1_3x1_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(bias)
#endif /* HAS_BIAS */
)
{
    // Delegate to the 2x2/3x3 kernel, passing every argument through as received.
    winograd_output_transform_2x2_3x3_nchw(
        /* source tensor */
        src_ptr,
        src_stride_x, src_step_x,
        src_stride_y, src_step_y,
        src_stride_z, src_step_z,
        src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        /* destination tensor */
        dst_ptr,
        dst_stride_x, dst_step_x,
        dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z,
        dst_stride_w, dst_step_w,
        dst_offset_first_element_in_bytes
#ifdef HAS_BIAS
        /* bias vector */
        ,
        bias_ptr,
        bias_stride_x, bias_step_x,
        bias_offset_first_element_in_bytes
#endif /* HAS_BIAS */
    );
}

/** Winograd output transform for a 4x1 output tile and a 3x1 filter, NCHW data layout.
 *
 * This kernel is a thin entry point: it forwards all of its arguments, unchanged, to
 * winograd_output_transform_4x4_3x3_nchw.
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_output_transform_4x1_3x1_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(bias)
#endif /* HAS_BIAS */
)
{
    // Delegate to the 4x4/3x3 kernel, passing every argument through as received.
    winograd_output_transform_4x4_3x3_nchw(
        /* source tensor */
        src_ptr,
        src_stride_x, src_step_x,
        src_stride_y, src_step_y,
        src_stride_z, src_step_z,
        src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        /* destination tensor */
        dst_ptr,
        dst_stride_x, dst_step_x,
        dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z,
        dst_stride_w, dst_step_w,
        dst_offset_first_element_in_bytes
#ifdef HAS_BIAS
        /* bias vector */
        ,
        bias_ptr,
        bias_stride_x, bias_step_x,
        bias_offset_first_element_in_bytes
#endif /* HAS_BIAS */
    );
}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1254

*

1255

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1256

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1257

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1258

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1259

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1260

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1261

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1262

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1263

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1264

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1265

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1266

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1267

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1268

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1269

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1270

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1271

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1272

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1273

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1274

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1275

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1276

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1277

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1278

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1279

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1280

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1281

*/

1282

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1283

TENSOR4D_DECLARATION(src),

1284

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1285

#if defined(HAS_BIAS)

1286

,

1287

VECTOR_DECLARATION(bias)

1288

#endif // defined(HAS_BIAS)

1289

)

1290

{

1291

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1298

src_stride_w,

1299

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1300

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1308

dst_stride_w,

1309

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1310

dst_offset_first_element_in_bytes

1311

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1317

#endif // defined(HAS_BIAS)

1318

);

1319

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1320

1321

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1322

*

1323

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1324

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1325

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1326

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1327

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1328

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1329

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1330

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1331

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1332

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1333

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1334

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1335

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1336

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1337

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1338

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1339

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1340

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1341

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1342

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1343

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1344

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1345

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1346

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1347

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1348

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1349

*/

1350

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1351

TENSOR4D_DECLARATION(src),

1352

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1353

#if defined(HAS_BIAS)

1354

VECTOR_DECLARATION(bias),

1355

#endif // defined(HAS_BIAS)

1356

int dst_size)

1357

{

1358

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1365

src_stride_w,

1366

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1367

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1375

dst_stride_w,

1376

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1377

dst_offset_first_element_in_bytes,

1378

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1383

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1388

*

1389

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1390

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1391

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1392

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1393

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1394

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1395

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1396

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1397

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1398

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1399

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1400

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1401

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1402

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1403

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1404

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1405

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1406

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1407

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1408

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1409

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1410

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1411

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1412

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1413

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1414

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1415

*/

1416

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1417

TENSOR4D_DECLARATION(src),

1418

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1419

#if defined(HAS_BIAS)

1420

VECTOR_DECLARATION(bias),

1421

#endif // defined(HAS_BIAS)

1422

int dst_size)

1423

{

1424

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1431

src_stride_w,

1432

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1433

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1441

dst_stride_w,

1442

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1443

dst_offset_first_element_in_bytes,

1444

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1449

#endif // defined(HAS_BIAS)

1450

dst_size);

1451

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1452

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1453

1454

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1455

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1456

*

1457

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1458

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1459

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1460

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1461

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1462

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1463

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1464

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1465

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1466

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1467

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1468

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1469

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1470

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1471

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1472

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1473

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1474

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1475

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1476

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1477

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1478

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1479

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1480

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1481

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1482

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1483

*/

1484

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1485

TENSOR4D_DECLARATION(src),

1486

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1487

#if defined(HAS_BIAS)

1488

,

1489

VECTOR_DECLARATION(bias)

1490

#endif // defined(HAS_BIAS)

1491

)

1492

{

1493

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1500

src_stride_w,

1501

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1502

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1510

dst_stride_w,

1511

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1512

dst_offset_first_element_in_bytes

1513

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1519

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1524

*

1525

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1526

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1527

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1528

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1529

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1530

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1531

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1532

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1533

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1534

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1535

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1536

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1537

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1538

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1539

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1540

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1541

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1542

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1543

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1544

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1545

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1546

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1547

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1548

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1549

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1550

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1551

*/

1552

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1553

TENSOR4D_DECLARATION(src),

1554

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1555

#if defined(HAS_BIAS)

1556

,

1557

VECTOR_DECLARATION(bias)

1558

#endif // defined(HAS_BIAS)

1559

)

1560

{

1561

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1568

src_stride_w,

1569

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1570

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1578

dst_stride_w,

1579

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1580

dst_offset_first_element_in_bytes

1581

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1587

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1592

*

1593

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1594

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1595

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1596

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1597

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1598

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1599

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1600

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1601

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1602

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1603

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1604

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1605

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1606

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1607

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1608

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1609

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1610

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1611

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1612

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1613

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1614

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1615

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1616

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1617

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1618

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1619

*/

1620

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1621

TENSOR4D_DECLARATION(src),

1622

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1623

#if defined(HAS_BIAS)

1624

,

1625

VECTOR_DECLARATION(bias)

1626

#endif // defined(HAS_BIAS)

1627

)

1628

{

1629

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1636

src_stride_w,

1637

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1638

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1646

dst_stride_w,

1647

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1648

dst_offset_first_element_in_bytes

1649

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1655

#endif // defined(HAS_BIAS)

1656

);

1657

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1658

1659

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

1660

*

1661

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1662

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1663

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1664

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1665

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1666

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1667

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1668

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1669

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1670

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1671

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1672

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1673

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1674

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1675

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1676

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1677

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1678

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1679

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1680

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1681

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1682

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1683

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1684

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1685

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1686

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1687

*/

1688

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1689

TENSOR4D_DECLARATION(src),

1690

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1691

// The bias vector is part of the signature only when HAS_BIAS is defined at compile time.
#if defined(HAS_BIAS)

1692

VECTOR_DECLARATION(bias),

1693

#endif // defined(HAS_BIAS)

1694

int dst_size)

1695

{

1696

// Thin wrapper: every argument is forwarded unchanged to winograd_output_transform_4x4_3x3_nhwc.
// The src_*/dst_* names used below are not declared explicitly in this signature; presumably they
// come from the TENSOR4D_DECLARATION(src)/TENSOR4D_DECLARATION(dst) macro expansions - TODO confirm
// that the argument order here matches the macro definition.
winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1703

src_stride_w,

1704

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1705

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1713

dst_stride_w,

1714

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1715

dst_offset_first_element_in_bytes,

1716

// Bias arguments are forwarded only when HAS_BIAS is defined, mirroring the conditional parameter above.
// NOTE(review): the bias_* names presumably come from the VECTOR_DECLARATION(bias) expansion - confirm.
#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1721

#endif // defined(HAS_BIAS)

// dst_size is passed through as-is; it is not interpreted in this wrapper.
dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1726

*

1727

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1728

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1729

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1730

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1731

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1732

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1733

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1734

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1735

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1736

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1737

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1738

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1739

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1740

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1741

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1742

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1743

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1744

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1745

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1746

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1747

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1748

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1749

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1750

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1751

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1752

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1753

*/

1754

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1755

TENSOR4D_DECLARATION(src),

1756

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1757

// The bias vector is part of the signature only when HAS_BIAS is defined at compile time.
#if defined(HAS_BIAS)

1758

VECTOR_DECLARATION(bias),

1759

#endif // defined(HAS_BIAS)

1760

int dst_size)

1761

{

1762

// Thin wrapper: every argument is forwarded unchanged to winograd_output_transform_4x4_5x5_nhwc.
// The src_*/dst_* names used below are not declared explicitly in this signature; presumably they
// come from the TENSOR4D_DECLARATION(src)/TENSOR4D_DECLARATION(dst) macro expansions - TODO confirm
// that the argument order here matches the macro definition.
// NOTE(review): the 4x4/5x5 kernel presumably handles the 1x4/1x5 case via the
// WINOGRAD_OUTPUT_TRANSFORM_VERTICAL define required by the header comment - confirm in the callee.
winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1769

src_stride_w,

1770

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1771

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1779

dst_stride_w,

1780

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1781

dst_offset_first_element_in_bytes,

1782

// Bias arguments are forwarded only when HAS_BIAS is defined, mirroring the conditional parameter above.
// NOTE(review): the bias_* names presumably come from the VECTOR_DECLARATION(bias) expansion - confirm.
#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1787

#endif // defined(HAS_BIAS)

1788

// dst_size is passed through as-is; it is not interpreted in this wrapper.
dst_size);

1789

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1790

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas