Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

37

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

38

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

39

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

40

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

41

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

43

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

44

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

45

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

46

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

47

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

48

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

49

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

50

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

51

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

52

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

53

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

54

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

55

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

56

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

57

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

58

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

59

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

60

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

*/

63

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

64

TENSOR4D_DECLARATION(src),

65

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

66

#if defined(HAS_BIAS)

67

,

68

VECTOR_DECLARATION(bias)

69

#endif // defined(HAS_BIAS)

70

)

71

{

72

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

73

#if defined(SRC_DEPTH)

74

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

75

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

76

#else /* defined(SRC_DEPTH) */

77

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

78

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

79

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

80

81

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

82

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

83

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

84

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

85

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

88

// Compute the 2x1 or 1x2 output tile

89

// out00 = d00 + d01 + d02

90

// out01 = d01 - d02 - d03

91

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

92

float out00 = d00 + d01 + d02;

93

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

94

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

95

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

96

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

97

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

98

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

99

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

101

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

102

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

103

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

104

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

105

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

106

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

107

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

108

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

109

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

110

111

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

112

float k0 = d01 + d11 + d21;

113

float k1 = d02 + d12 + d22;

114

float k2 = d11 - d21 - d31;

115

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

118

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

119

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

120

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

121

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

126

127

out00 += d00 + d20 + k0 + k1;

128

out01 += k0 - k1 - (d03 + d23);

129

out10 += -d20 - d30 + k2 + k3;

130

out11 += k2 - k3 + d23 + d33;

131

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

132

133

int y_in = get_global_id(1);

134

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

135

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

136

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

137

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

138

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

139

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

141

#if defined(HAS_BIAS)

142

// Add bias

143

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

144

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

145

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

147

out00 += (float)b;

148

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

#endif // defined(HAS_BIAS)

150

151

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

152

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

153

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

154

#else /* defined(SRC_DEPTH) */

155

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

156

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

157

158

// Store the output tile

159

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Michalis Spyrou

ef6ec50

2020-07-31 11:38:36 +0100

[diff] [blame]

160

const VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

161

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

162

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

163

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

164

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

165

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

166

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

167

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

168

169

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

170

#if defined(HAS_BIAS)

171

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

172

out10 += (DATA_TYPE)b;

173

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

174

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

175

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

176

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

177

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

178

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

179

180

#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \

181

({ \

182

col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \

183

col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \

184

})

185

186

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

187

*

188

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

189

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

190

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

191

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

192

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

193

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

194

*

195

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

196

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

197

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

198

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

199

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

200

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

201

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

202

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

203

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

204

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

205

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

206

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

207

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

208

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

209

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

210

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

211

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

212

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

213

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

214

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

215

*/

216

__kernel void winograd_output_transform_2x2_7x7_nhwc(

217

TENSOR4D_DECLARATION(src),

218

TENSOR4D_DECLARATION(dst),

219

#if defined(HAS_BIAS)

220

VECTOR_DECLARATION(bias),

221

#endif // defined(HAS_BIAS)

222

int dst_size)

223

{

224

// Each thread stores a 4x4/4x1 or 1x4 tile

225

#if defined(SRC_DEPTH)

226

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

227

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

228

#else /* defined(SRC_DEPTH) */

229

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

230

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

231

#endif /* defined(SRC_DEPTH) */

232

233

int y_in = get_global_id(1);

234

int x_out = get_global_id(0);

235

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

236

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

237

#if defined(SRC_DEPTH)

238

int batch = get_global_id(2) / SRC_DEPTH;

239

#endif /* defined(SRC_DEPTH) */

240

241

#if defined(SRC_DEPTH)

242

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

243

#else /* defined(SRC_DEPTH) */

244

245

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

246

#endif /* defined(SRC_DEPTH) */

247

248

// Load the values across the channels to compose the input tile

249

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

250

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

251

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

252

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

253

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

254

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

255

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

256

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

257

258

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

259

// Compute out00, out01, out02 and out03

260

float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;

261

float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;

262

263

#if defined(HAS_BIAS)

264

// Add bias

265

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

266

267

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

out00 += (float)b;

out01 += (float)b;

#endif // defined(HAS_BIAS)

272

273

// Store the output tile

274

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

275

// Get output address

276

#if defined(SRC_DEPTH)

277

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

278

#else /* defined(SRC_DEPTH) */

279

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

280

#endif /* defined(SRC_DEPTH) */

281

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

282

283

VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

284

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

285

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

286

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

287

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

288

// Get output address

289

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

290

VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

291

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL,

292

B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

293

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

294

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

295

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

296

297

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

298

299

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

300

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

301

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

302

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

303

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

304

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

305

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

306

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

307

308

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

309

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

310

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

311

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

312

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

313

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

314

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

315

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

316

317

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

318

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

319

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

320

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

321

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

322

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

323

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

324

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

325

326

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

327

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

328

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

329

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

330

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

331

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

332

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

333

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

334

335

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

336

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

337

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

338

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

339

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

340

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

341

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

342

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

343

344

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

345

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

346

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

347

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

348

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

349

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

350

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

351

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

352

353

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

354

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

355

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

356

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

357

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

358

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

359

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

360

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

361

362

// Compute the 8x2 intermediate tensor

363

VEC_DATA_TYPE(float, 2)

364

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

365

366

COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);

367

COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);

368

COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);

369

COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);

370

COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);

371

COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);

372

COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);

373

COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);

374

375

// Compute the 2x2 output tile

376

VEC_DATA_TYPE(float, 2)

377

out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;

378

VEC_DATA_TYPE(float, 2)

379

out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;

380

381

#if defined(HAS_BIAS)

382

// Add bias

383

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

384

385

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

386

387

out_col0 += (VEC_DATA_TYPE(float, 2))b;

388

out_col1 += (VEC_DATA_TYPE(float, 2))b;

389

390

#endif // defined(HAS_BIAS)

391

// Get output address

392

#if defined(SRC_DEPTH)

393

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

394

#else /* defined(SRC_DEPTH) */

395

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

396

#endif /* defined(SRC_DEPTH) */

397

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

398

int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

399

400

// Store the output tile

401

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

402

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

403

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

404

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

405

406

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

407

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

408

409

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

410

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

411

412

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

413

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

414

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

415

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

416

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

417

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

418

*

419

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

420

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

421

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

422

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

423

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

424

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

425

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

426

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

427

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

428

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

429

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

430

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

431

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

432

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

433

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

434

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

435

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

436

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

437

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

438

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

439

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

440

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

441

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

442

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

443

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

444

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

445

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

446

*/

447

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

448

TENSOR4D_DECLARATION(src),

449

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

450

#if defined(HAS_BIAS)

451

,

452

VECTOR_DECLARATION(bias)

453

#endif // defined(HAS_BIAS)

454

)

455

{

456

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

457

#if defined(SRC_DEPTH)

458

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

459

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

460

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

461

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

462

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

463

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

464

465

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

466

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

467

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

468

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

469

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

470

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

471

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

472

473

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

474

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

475

float out00 = d00 + d01 + d02 + d03 + d04;

476

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

477

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

478

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

479

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

480

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

481

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

482

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

483

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

484

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

485

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

486

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

487

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

488

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

489

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

490

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

491

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

492

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

493

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

494

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

495

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

496

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

497

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

498

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

499

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

500

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

501

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

502

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

503

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

504

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

505

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

506

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

507

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

508

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

509

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

510

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

511

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

512

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

513

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

514

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

515

516

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

517

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

518

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

519

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

520

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

521

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

522

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

523

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

524

525

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

526

out01 += k1 - d02 - d12 - d22 - d32 - d42;

527

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

528

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

529

530

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

531

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

532

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

533

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

534

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

535

536

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

537

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

538

539

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

540

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

541

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

542

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

543

544

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

545

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

546

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

547

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

548

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

549

550

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

551

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

552

553

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

554

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

555

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

556

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

557

558

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

559

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

560

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

561

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

562

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

563

564

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

565

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

566

567

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

568

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

569

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

570

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

571

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

572

573

int y_in = get_global_id(1);

574

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

575

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

576

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

577

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

578

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

579

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

580

581

#if defined(HAS_BIAS)

582

// Add bias

583

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

584

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

585

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

586

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

591

#endif // defined(HAS_BIAS)

592

593

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

594

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

595

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

596

#else /* defined(SRC_DEPTH) */

597

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

598

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

599

600

// Store the output tile

601

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

602

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

603

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

604

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

605

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

606

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

607

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

608

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

609

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

610

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

611

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

612

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

613

614

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

615

#if defined(HAS_BIAS)

616

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

621

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

626

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

631

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

632

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

633

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

634

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

635

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

636

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

637

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

638

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

639

}

640

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

641

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

642

*

643

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

644

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

645

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

646

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

647

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

648

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

649

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

650

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

651

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

652

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

653

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

654

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

655

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

656

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

657

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

658

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

659

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

660

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

661

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

662

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

663

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

664

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

665

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

666

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

667

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

668

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

669

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

670

* @param[in] dst_size Size of the destination tensor, minus the last padding

671

*/

672

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

673

TENSOR4D_DECLARATION(src),

674

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

675

#if defined(HAS_BIAS)

676

VECTOR_DECLARATION(bias),

677

#endif // defined(HAS_BIAS)

678

int dst_size)

679

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

680

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

681

#if defined(SRC_DEPTH)

682

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

683

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

684

#else /* defined(SRC_DEPTH) */

685

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

686

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

687

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

688

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

689

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

690

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

691

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

692

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

693

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

694

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

695

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

696

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

697

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

698

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

699

float out00 = d00 + d01 + d02 + d03 + d04;

700

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

701

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

702

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

703

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

704

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

705

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

706

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

707

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

708

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

709

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

710

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

711

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

712

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

713

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

714

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

715

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

716

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

717

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

718

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

719

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

720

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

721

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

722

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

723

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

724

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

725

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

726

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

727

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

728

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

729

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

730

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

731

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

732

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

733

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

734

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

735

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

736

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

737

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

738

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

739

740

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

741

float out00 = d01 + d21 + d41 + d11 + d31;

742

float out01 = d01 + d21 + d41 + d11 + d31;

743

float out02 = d01 + d21 + d41 + d11 + d31;

744

float out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

745

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

746

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

747

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

748

749

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

750

out01 += k1 - d02 - d12 - d22 - d32 - d42;

751

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

752

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

753

754

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

755

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

756

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

757

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

758

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

759

760

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

761

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

762

763

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

764

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

765

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

766

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

767

768

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

769

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

770

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

771

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

772

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

773

774

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

775

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

776

777

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

778

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

779

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

780

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

781

782

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

783

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

784

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

785

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

786

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

787

788

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

789

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

790

791

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

792

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

793

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

794

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

795

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

796

797

int y_in = get_global_id(1);

798

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

799

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

800

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

801

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

802

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

803

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

804

805

#if defined(HAS_BIAS)

806

// Add bias

807

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

808

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

809

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

810

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

811

out00 += (DATA_TYPE)b;

812

out01 += (DATA_TYPE)b;

813

out02 += (DATA_TYPE)b;

814

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

815

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

816

out10 += (DATA_TYPE)b;

817

out11 += (DATA_TYPE)b;

818

out12 += (DATA_TYPE)b;

819

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

820

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

821

out20 += (DATA_TYPE)b;

822

out21 += (DATA_TYPE)b;

823

out22 += (DATA_TYPE)b;

824

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

825

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

826

out30 += (DATA_TYPE)b;

827

out31 += (DATA_TYPE)b;

828

out32 += (DATA_TYPE)b;

829

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

830

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

831

832

#endif // defined(HAS_BIAS)

833

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

834

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

835

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

836

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

837

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

838

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

839

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

840

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

841

842

// Store the 1x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

843

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

844

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

845

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

846

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;

847

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;

848

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;

849

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

850

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

851

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

852

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

853

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

854

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

855

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

856

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

857

A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

858

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;

859

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;

860

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;

861

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

862

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

863

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

864

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

865

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

866

#else /* defined(SRC_DEPTH) */

867

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

868

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

869

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

870

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

871

872

// Store the 4x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

873

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

874

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

875

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

876

out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

877

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

878

out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

879

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

880

out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),

881

VEC_DATA_TYPE(DATA_TYPE, 4)),

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

882

A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

883

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;

884

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;

885

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;

886

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;

887

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;

888

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;

889

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;

890

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;

891

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;

892

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;

893

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;

894

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;

895

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;

896

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;

897

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;

898

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

899

900

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

901

}

902

903

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

904

({ \

905

comm_fact.s0 = d1 + d2; \

906

comm_fact.s1 = d3 + d4; \

907

comm_fact.s2 = d5 + d6; \

908

\

909

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

910

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

911

\

912

comm_fact.s0 = d1 - d2; \

913

comm_fact.s1 = d3 - d4; \

914

comm_fact.s2 = d5 - d6; \

915

\

916

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

917

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

918

})

919

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

920

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

921

*

922

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

923

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

924

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

925

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

926

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

927

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

928

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

929

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

930

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

931

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

932

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

933

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

934

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

935

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

936

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

937

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

938

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

939

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

940

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

941

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

942

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

943

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

944

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

945

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

946

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

947

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

948

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

949

*/

950

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

951

TENSOR4D_DECLARATION(src),

952

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

953

#if defined(HAS_BIAS)

954

,

955

VECTOR_DECLARATION(bias)

956

#endif // defined(HAS_BIAS)

957

)

958

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

959

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

960

#if defined(SRC_DEPTH)

961

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

962

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

963

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

964

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

965

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

966

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

967

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

968

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

969

// Compute output address

970

int y_in = get_global_id(1);

971

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

972

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

973

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

974

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

975

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

976

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

977

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

978

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

979

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

980

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

981

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

982

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

983

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

984

985

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

986

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

987

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

988

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

989

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

990

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

991

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

992

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

993

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

994

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

995

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

996

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

997

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

998

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

999

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

1000

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1001

1002

#if defined(HAS_BIAS)

1003

// Add bias

1004

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1005

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1006

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1007

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1008

out00 += (DATA_TYPE)b;

1009

out01 += (DATA_TYPE)b;

1010

out02 += (DATA_TYPE)b;

1011

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1012

#endif // defined(HAS_BIAS)

1013

1014

// Store the output tile

1015

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1016

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1017

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

1018

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1019

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

1020

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

1021

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

1022

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1023

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1024

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1025

(__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1026

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1027

1028

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1029

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1030

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1031

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1032

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1033

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1034

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1035

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1036

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1037

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1038

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1039

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1040

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1041

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1042

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1043

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1044

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1045

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1046

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1047

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1048

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1049

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1050

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1051

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1052

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1053

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1054

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1055

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1056

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1057

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1058

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1059

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1060

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1061

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1062

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1063

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1064

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1065

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1066

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1067

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1068

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1069

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1070

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1071

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1072

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1073

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1074

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1075

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1076

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1077

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1078

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1079

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1080

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1081

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1082

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1083

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1084

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1085

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1086

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1087

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1088

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1089

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1090

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1091

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1092

1093

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1094

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1095

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1096

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1097

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1098

1099

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1100

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1101

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1102

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1103

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1104

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1105

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1106

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1107

1108

// Compute the 4x4 output tile

1109

comm_fact0 = tmp_col1 + tmp_col2;

1110

comm_fact1 = tmp_col3 + tmp_col4;

1111

comm_fact2 = tmp_col5 + tmp_col6;

1112

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1113

VEC_DATA_TYPE(float, 4)

1114

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

1115

VEC_DATA_TYPE(float, 4)

1116

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1117

1118

comm_fact0 = tmp_col1 - tmp_col2;

1119

comm_fact1 = tmp_col3 - tmp_col4;

1120

comm_fact2 = tmp_col5 - tmp_col6;

1121

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1122

VEC_DATA_TYPE(float, 4)

1123

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

1124

VEC_DATA_TYPE(float, 4)

1125

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1126

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1127

#if defined(HAS_BIAS)

1128

// Add bias

1129

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1130

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1131

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1132

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1133

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1134

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1135

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1136

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1137

#endif // defined(HAS_BIAS)

1138

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1139

// Store the output tile

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1140

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1141

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1142

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1143

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1144

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1145

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1146

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1147

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1148

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1149

}

1150

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1151

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1152

*

1153

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1154

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1155

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1156

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1157

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1158

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1159

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1160

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1161

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1162

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1163

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1164

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1165

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1166

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1167

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1168

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1169

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1170

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1171

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1172

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1173

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1174

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1175

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1176

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1177

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1178

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1179

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1180

*/

1181

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1182

TENSOR4D_DECLARATION(src),

1183

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1184

#if defined(HAS_BIAS)

1185

VECTOR_DECLARATION(bias),

1186

#endif // defined(HAS_BIAS)

1187

int dst_size)

1188

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1189

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1190

#if defined(SRC_DEPTH)

1191

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1192

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1193

#else /* defined(SRC_DEPTH) */

1194

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1195

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

1196

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1197

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1198

int y_in = get_global_id(1);

1199

int x_out = get_global_id(0);

1200

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

1201

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1202

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1203

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1204

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1205

1206

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1207

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

1208

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

1209

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

1210

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

1211

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

1212

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

1213

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

1214

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1215

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1216

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1217

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1218

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

1219

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

1220

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

1221

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1222

1223

#if defined(HAS_BIAS)

1224

// Add bias

1225

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1226

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1227

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1228

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1233

#endif // defined(HAS_BIAS)

1234

1235

// Store the output tile

1236

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1237

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1238

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1239

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1240

#else /* defined(SRC_DEPTH) */

1241

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1242

#endif /* defined(SRC_DEPTH) */

1243

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1244

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1245

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1246

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

1247

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1248

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

1249

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

1250

*(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;

1251

*(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1252

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1253

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1254

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1255

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1256

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1257

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1258

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

1259

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

1260

*(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;

1261

*(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1262

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1263

1264

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1265

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1266

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1267

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1268

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1269

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1270

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1271

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1272

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1273

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1274

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1275

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1276

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1277

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1278

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1279

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1280

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1281

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1282

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1283

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1284

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1285

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1286

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1287

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1288

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1289

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1290

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1291

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1292

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1293

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1294

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1295

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1296

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1297

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1298

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1299

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1300

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1301

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1302

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1303

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1304

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1305

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1306

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1307

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1308

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1309

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1310

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1311

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1312

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1313

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1314

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1315

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1316

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1317

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1318

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1319

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1320

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1321

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1322

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1323

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1324

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1325

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1326

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1327

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1328

1329

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1330

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1331

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1332

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1333

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1334

1335

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1336

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1337

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1338

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1339

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1340

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1341

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1342

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1343

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1344

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1345

comm_fact0 = tmp_col1 + tmp_col2;

1346

comm_fact1 = tmp_col3 + tmp_col4;

1347

comm_fact2 = tmp_col5 + tmp_col6;

1348

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1349

VEC_DATA_TYPE(float, 4)

1350

out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1351

VEC_DATA_TYPE(float, 4)

1352

out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1353

1354

comm_fact0 = tmp_col1 - tmp_col2;

1355

comm_fact1 = tmp_col3 - tmp_col4;

1356

comm_fact2 = tmp_col5 - tmp_col6;

1357

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1358

VEC_DATA_TYPE(float, 4)

1359

out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1360

VEC_DATA_TYPE(float, 4)

1361

out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1362

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1363

#if defined(HAS_BIAS)

1364

// Add bias

1365

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1366

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1367

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1368

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1369

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1370

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1371

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1372

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1373

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1374

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1375

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1376

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1377

#else /* defined(SRC_DEPTH) */

1378

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1379

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

1380

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1381

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1382

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1383

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1384

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1385

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1386

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1387

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1388

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1389

out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1390

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1391

out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1392

1393

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

1394

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

1395

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;

1396

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;

1397

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

1398

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

1399

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;

1400

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;

1401

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;

1402

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;

1403

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;

1404

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;

1405

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;

1406

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;

1407

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;

1408

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1409

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1410

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1411

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1412

1413

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1414

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1415

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1416

*

1417

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1418

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1419

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1420

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1421

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1422

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1423

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1424

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1425

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1426

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1427

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1428

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1429

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1430

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1431

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1432

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1433

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1434

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1435

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1436

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1437

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1438

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1439

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1440

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1441

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1442

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1443

*/

1444

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1445

TENSOR4D_DECLARATION(src),

1446

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1447

#if defined(HAS_BIAS)

1448

,

1449

VECTOR_DECLARATION(bias)

1450

#endif // defined(HAS_BIAS)

1451

)

1452

{

1453

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1460

src_stride_w,

1461

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1462

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1470

dst_stride_w,

1471

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1472

dst_offset_first_element_in_bytes

1473

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1479

#endif // defined(HAS_BIAS)

1480

);

1481

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1482

1483

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

1484

*

1485

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1486

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1487

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1488

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1489

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1490

*

1491

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1492

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1493

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1494

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1495

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1496

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1497

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1498

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1499

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1500

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1501

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1502

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1503

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1504

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1505

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1506

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1507

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1508

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1509

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1510

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1511

*/

1512

__kernel void winograd_output_transform_2x1_7x1_nhwc(

1513

TENSOR4D_DECLARATION(src),

1514

TENSOR4D_DECLARATION(dst),

1515

#if defined(HAS_BIAS)

1516

VECTOR_DECLARATION(bias),

1517

#endif // defined(HAS_BIAS)

1518

int dst_size)

1519

{

1520

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1540

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1545

#endif // defined(HAS_BIAS)

1546

dst_size);

1547

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1548

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1549

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1550

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1551

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1552

*

1553

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1554

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1555

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1556

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1557

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1558

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1559

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1560

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1561

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1562

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1563

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1564

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1565

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1566

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1567

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1568

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1569

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1570

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1572

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1574

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1575

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1576

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1577

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1578

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1579

*/

1580

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1581

TENSOR4D_DECLARATION(src),

1582

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1583

#if defined(HAS_BIAS)

1584

,

1585

VECTOR_DECLARATION(bias)

1586

#endif // defined(HAS_BIAS)

1587

)

1588

{

1589

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1596

src_stride_w,

1597

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1598

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1606

dst_stride_w,

1607

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1608

dst_offset_first_element_in_bytes

1609

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1615

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1620

*

1621

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1622

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1623

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1624

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1625

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1626

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1627

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1628

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1629

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1630

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1631

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1632

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1633

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1634

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1635

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1636

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1637

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1638

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1639

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1640

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1641

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1642

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1643

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1644

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1645

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1646

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1647

*/

1648

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1649

TENSOR4D_DECLARATION(src),

1650

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1651

#if defined(HAS_BIAS)

1652

,

1653

VECTOR_DECLARATION(bias)

1654

#endif // defined(HAS_BIAS)

1655

)

1656

{

1657

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1664

src_stride_w,

1665

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1666

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1674

dst_stride_w,

1675

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1676

dst_offset_first_element_in_bytes

1677

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1683

#endif // defined(HAS_BIAS)

1684

);

1685

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1686

1687

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1688

*

1689

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1690

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1691

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1692

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1693

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1694

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1695

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1696

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1697

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1698

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1699

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1700

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1701

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1702

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1703

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1704

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1705

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1706

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1707

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1708

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1709

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1710

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1711

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1712

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1713

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1714

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1715

*/

1716

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1717

TENSOR4D_DECLARATION(src),

1718

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1719

#if defined(HAS_BIAS)

1720

VECTOR_DECLARATION(bias),

1721

#endif // defined(HAS_BIAS)

1722

int dst_size)

1723

{

1724

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1731

src_stride_w,

1732

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1733

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1741

dst_stride_w,

1742

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1743

dst_offset_first_element_in_bytes,

1744

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1749

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1754

*

1755

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1756

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1757

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1758

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1759

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1760

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1761

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1762

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1763

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1764

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1765

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1766

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1767

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1768

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1769

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1770

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1771

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1772

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1773

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1774

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1775

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1776

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1777

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1778

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1779

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1780

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1781

*/

1782

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1783

TENSOR4D_DECLARATION(src),

1784

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1785

#if defined(HAS_BIAS)

1786

VECTOR_DECLARATION(bias),

1787

#endif // defined(HAS_BIAS)

1788

int dst_size)

1789

{

1790

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1797

src_stride_w,

1798

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1799

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1807

dst_stride_w,

1808

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1809

dst_offset_first_element_in_bytes,

1810

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1815

#endif // defined(HAS_BIAS)

1816

dst_size);

1817

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1818

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1819

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1820

1821

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1822

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1823

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1824

*

1825

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1826

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1827

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1828

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1829

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1830

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1831

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1832

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1833

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1834

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1835

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1836

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1837

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1838

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1839

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1840

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1841

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1842

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1843

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1844

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1845

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1846

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1847

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1848

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1849

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1850

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1851

*/

1852

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1853

TENSOR4D_DECLARATION(src),

1854

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1855

#if defined(HAS_BIAS)

1856

,

1857

VECTOR_DECLARATION(bias)

1858

#endif // defined(HAS_BIAS)

1859

)

1860

{

1861

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1868

src_stride_w,

1869

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1870

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1878

dst_stride_w,

1879

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1880

dst_offset_first_element_in_bytes

1881

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1887

#endif // defined(HAS_BIAS)

1888

);

1889

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1890

1891

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

1892

*

1893

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1894

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1895

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1896

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1897

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1898

*

1899

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1900

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1901

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1902

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1903

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1904

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1905

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1906

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1907

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1908

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1909

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1910

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1911

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1912

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1913

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1914

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1915

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1916

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1917

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1918

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1919

*/

1920

__kernel void winograd_output_transform_1x2_1x7_nhwc(

1921

TENSOR4D_DECLARATION(src),

1922

TENSOR4D_DECLARATION(dst),

1923

#if defined(HAS_BIAS)

1924

VECTOR_DECLARATION(bias),

1925

#endif // defined(HAS_BIAS)

1926

int dst_size)

1927

{

1928

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1948

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1953

#endif // defined(HAS_BIAS)

1954

dst_size);

1955

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1956

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1957

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1958

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1959

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1960

*

1961

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1962

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1963

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1964

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1965

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1966

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1967

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1968

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1969

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1970

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1971

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1972

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1973

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1974

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1975

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1976

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1977

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1978

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1979

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1980

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1981

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1982

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1983

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1984

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1985

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1986

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1987

*/

1988

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1989

TENSOR4D_DECLARATION(src),

1990

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1991

#if defined(HAS_BIAS)

1992

,

1993

VECTOR_DECLARATION(bias)

1994

#endif // defined(HAS_BIAS)

1995

)

1996

{

1997

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2004

src_stride_w,

2005

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2006

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2014

dst_stride_w,

2015

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2016

dst_offset_first_element_in_bytes

2017

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2023

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

2028

*

2029

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2030

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2031

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2032

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2033

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2034

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2035

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2036

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2037

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2038

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2039

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2040

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2041

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2042

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2043

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2044

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2045

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2046

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2047

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2048

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2049

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2050

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2051

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2052

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2053

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2054

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2055

*/

2056

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2057

TENSOR4D_DECLARATION(src),

2058

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2059

#if defined(HAS_BIAS)

2060

,

2061

VECTOR_DECLARATION(bias)

2062

#endif // defined(HAS_BIAS)

2063

)

2064

{

2065

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2072

src_stride_w,

2073

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2074

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2082

dst_stride_w,

2083

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2084

dst_offset_first_element_in_bytes

2085

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2091

#endif // defined(HAS_BIAS)

2092

);

2093

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2094

2095

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

2096

*

2097

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2098

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2099

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2100

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2101

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2102

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2103

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2104

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2105

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2106

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2107

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2108

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2109

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2110

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2111

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2112

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2113

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2114

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2115

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2116

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2117

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2118

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2119

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2120

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2121

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2122

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2123

*/

2124

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2125

TENSOR4D_DECLARATION(src),

2126

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2127

#if defined(HAS_BIAS)

2128

VECTOR_DECLARATION(bias),

2129

#endif // defined(HAS_BIAS)

2130

int dst_size)

2131

{

2132

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2139

src_stride_w,

2140

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2141

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2149

dst_stride_w,

2150

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2151

dst_offset_first_element_in_bytes,

2152

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2157

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

2162

*

2163

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2164

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2165

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2166

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2167

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2168

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2169

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2170

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2171

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2172

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2173

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2174

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2175

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2176

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2177

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2178

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2179

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2180

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2181

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2182

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2183

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2184

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2185

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2186

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2187

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2188

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2189

*/

2190

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2191

TENSOR4D_DECLARATION(src),

2192

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2193

#if defined(HAS_BIAS)

2194

VECTOR_DECLARATION(bias),

2195

#endif // defined(HAS_BIAS)

2196

int dst_size)

2197

{

2198

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2205

src_stride_w,

2206

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2207

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2215

dst_stride_w,

2216

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2217

dst_offset_first_element_in_bytes,

2218

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2223

#endif // defined(HAS_BIAS)

2224

dst_size);

2225

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

2226

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2227

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas