Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

37

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

38

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

39

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

40

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

41

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

43

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

44

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

45

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

46

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

47

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

48

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

49

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

50

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

51

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

52

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

53

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

54

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

55

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

56

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

57

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

58

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

59

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

60

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

*/

63

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

64

TENSOR4D_DECLARATION(src),

65

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

66

#if defined(HAS_BIAS)

67

,

68

VECTOR_DECLARATION(bias)

69

#endif // defined(HAS_BIAS)

70

)

71

{

72

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

73

#if defined(SRC_DEPTH)

74

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

75

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

76

#else /* defined(SRC_DEPTH) */

77

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

78

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

79

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

80

81

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

82

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

83

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

84

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

85

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

88

// Compute the 2x1 or 1x2 output tile

89

// out00 = d00 + d01 + d02

90

// out01 = d01 - d02 - d03

91

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

92

float out00 = d00 + d01 + d02;

93

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

94

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

95

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

96

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

97

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

98

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

99

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

101

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

102

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

103

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

104

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

105

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

106

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

107

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

108

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

109

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

110

111

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

112

float k0 = d01 + d11 + d21;

113

float k1 = d02 + d12 + d22;

114

float k2 = d11 - d21 - d31;

115

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

118

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

119

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

120

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

121

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

126

127

out00 += d00 + d20 + k0 + k1;

128

out01 += k0 - k1 - (d03 + d23);

129

out10 += -d20 - d30 + k2 + k3;

130

out11 += k2 - k3 + d23 + d33;

131

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

132

133

int y_in = get_global_id(1);

134

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

135

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

136

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

137

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

138

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

139

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

141

#if defined(HAS_BIAS)

142

// Add bias

143

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

144

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

145

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

147

out00 += (float)b;

148

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

#endif // defined(HAS_BIAS)

150

151

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

152

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

153

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

154

#else /* defined(SRC_DEPTH) */

155

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

156

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

157

158

// Store the output tile

159

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

160

const const VEC_DATA_TYPE(DATA_TYPE, 2)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

161

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

162

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

163

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

164

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

165

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

166

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

167

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

168

169

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

170

#if defined(HAS_BIAS)

171

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

172

out10 += (DATA_TYPE)b;

173

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

174

#endif // defined(HAS_BIAS)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

175

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

176

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

177

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

178

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

179

180

#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \

181

({ \

182

col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \

183

col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \

184

})

185

186

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

187

*

188

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

189

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

190

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

191

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

192

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

193

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

194

*

195

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

196

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

197

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

198

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

199

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

200

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

201

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

202

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

203

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

204

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

205

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

206

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

207

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

208

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

209

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

210

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

211

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

212

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

213

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

214

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

215

*/

216

__kernel void winograd_output_transform_2x2_7x7_nhwc(

217

TENSOR4D_DECLARATION(src),

218

TENSOR4D_DECLARATION(dst),

219

#if defined(HAS_BIAS)

220

VECTOR_DECLARATION(bias),

221

#endif // defined(HAS_BIAS)

222

int dst_size)

223

{

224

// Each thread stores a 4x4/4x1 or 1x4 tile

225

#if defined(SRC_DEPTH)

226

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

227

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

228

#else /* defined(SRC_DEPTH) */

229

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

230

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

231

#endif /* defined(SRC_DEPTH) */

232

233

int y_in = get_global_id(1);

234

int x_out = get_global_id(0);

235

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

236

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

237

#if defined(SRC_DEPTH)

238

int batch = get_global_id(2) / SRC_DEPTH;

239

#endif /* defined(SRC_DEPTH) */

240

241

#if defined(SRC_DEPTH)

242

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

243

#else /* defined(SRC_DEPTH) */

244

245

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

246

#endif /* defined(SRC_DEPTH) */

247

248

// Load the values across the channels to compose the input tile

249

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

250

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

251

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

252

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

253

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

254

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

255

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

256

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

257

258

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

259

// Compute out00, out01, out02 and out03

260

float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;

261

float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;

262

263

#if defined(HAS_BIAS)

264

// Add bias

265

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

266

267

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

out00 += (float)b;

out01 += (float)b;

#endif // defined(HAS_BIAS)

272

273

// Store the output tile

274

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

275

// Get output address

276

#if defined(SRC_DEPTH)

277

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

278

#else /* defined(SRC_DEPTH) */

279

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

280

#endif /* defined(SRC_DEPTH) */

281

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

282

283

VEC_DATA_TYPE(DATA_TYPE, 2)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

284

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

285

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

286

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

287

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

288

// Get output address

289

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

290

VEC_DATA_TYPE(DATA_TYPE, 2)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

291

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

292

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

293

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

294

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

295

296

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

297

298

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

299

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

300

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

301

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

302

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

303

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

304

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

305

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

306

307

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

308

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

309

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

310

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

311

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

312

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

313

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

314

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

315

316

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

317

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

318

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

319

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

320

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

321

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

322

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

323

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

324

325

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

326

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

327

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

328

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

329

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

330

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

331

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

332

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

333

334

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

335

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

336

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

337

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

338

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

339

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

340

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

341

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

342

343

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

344

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

345

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

346

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

347

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

348

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

349

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

350

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

351

352

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

353

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

354

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

355

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

356

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

357

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

358

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

359

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

360

361

// Compute the 8x2 intermediate tensor

362

VEC_DATA_TYPE(float, 2)

363

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

364

365

COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);

366

COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);

367

COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);

368

COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);

369

COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);

370

COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);

371

COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);

372

COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);

373

374

// Compute the 2x2 output tile

375

VEC_DATA_TYPE(float, 2)

376

out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;

377

VEC_DATA_TYPE(float, 2)

378

out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;

379

380

#if defined(HAS_BIAS)

381

// Add bias

382

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

383

384

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

385

386

out_col0 += (VEC_DATA_TYPE(float, 2))b;

387

out_col1 += (VEC_DATA_TYPE(float, 2))b;

388

389

#endif // defined(HAS_BIAS)

390

// Get output address

391

#if defined(SRC_DEPTH)

392

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

393

#else /* defined(SRC_DEPTH) */

394

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

395

#endif /* defined(SRC_DEPTH) */

396

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

397

int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

398

399

// Store the output tile

400

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

401

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

402

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

403

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

404

405

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

406

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

407

408

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

409

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

410

411

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

412

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

413

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

414

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

415

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

416

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

417

*

418

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

419

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

420

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

421

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

422

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

423

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

424

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

425

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

426

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

427

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

428

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

429

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

430

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

431

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

432

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

433

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

434

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

435

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

436

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

437

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

438

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

439

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

440

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

441

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

442

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

443

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

444

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

445

*/

446

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

447

TENSOR4D_DECLARATION(src),

448

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

449

#if defined(HAS_BIAS)

450

,

451

VECTOR_DECLARATION(bias)

452

#endif // defined(HAS_BIAS)

453

)

454

{

455

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

456

#if defined(SRC_DEPTH)

457

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

458

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

459

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

460

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

461

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

462

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

463

464

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

465

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

466

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

467

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

468

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

469

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

470

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

471

472

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

473

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

474

float out00 = d00 + d01 + d02 + d03 + d04;

475

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

476

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

477

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

478

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

479

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

480

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

481

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

482

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

483

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

484

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

485

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

486

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

487

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

488

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

489

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

490

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

491

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

492

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

493

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

494

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

495

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

496

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

497

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

498

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

499

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

500

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

501

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

502

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

503

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

504

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

505

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

506

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

507

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

508

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

509

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

510

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

511

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

512

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

513

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

514

515

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

516

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

517

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

518

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

519

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

520

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

521

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

522

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

523

524

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

525

out01 += k1 - d02 - d12 - d22 - d32 - d42;

526

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

527

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

528

529

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

530

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

531

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

532

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

533

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

534

535

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

536

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

537

538

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

539

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

540

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

541

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

542

543

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

544

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

545

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

546

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

547

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

548

549

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

550

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

551

552

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

553

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

554

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

555

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

556

557

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

558

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

559

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

560

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

561

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

562

563

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

564

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

565

566

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

567

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

568

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

569

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

570

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

571

572

int y_in = get_global_id(1);

573

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

574

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

575

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

576

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

577

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

578

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

579

580

#if defined(HAS_BIAS)

581

// Add bias

582

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

583

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

584

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

585

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

590

#endif // defined(HAS_BIAS)

591

592

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

593

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

594

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

595

#else /* defined(SRC_DEPTH) */

596

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

597

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

598

599

// Store the output tile

600

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

601

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

602

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

603

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

604

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

605

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

606

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

607

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

608

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

609

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

610

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

611

612

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

613

#if defined(HAS_BIAS)

614

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

619

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

624

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

629

#endif // defined(HAS_BIAS)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

630

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

631

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

632

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

633

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

634

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

635

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

636

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

637

}

638

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

639

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

640

*

641

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

642

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

643

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

644

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

645

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

646

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

647

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

648

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

649

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

650

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

651

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

652

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

653

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

654

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

655

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

656

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

657

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

658

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

659

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

660

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

662

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

663

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

664

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

665

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

666

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

667

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

668

* @param[in] dst_size Size of the destination tensor, minus the last padding

669

*/

670

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

671

TENSOR4D_DECLARATION(src),

672

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

673

#if defined(HAS_BIAS)

674

VECTOR_DECLARATION(bias),

675

#endif // defined(HAS_BIAS)

676

int dst_size)

677

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

678

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

679

#if defined(SRC_DEPTH)

680

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

681

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

682

#else /* defined(SRC_DEPTH) */

683

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

684

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

685

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

686

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

687

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

688

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

689

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

690

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

691

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

692

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

693

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

694

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

695

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

696

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

697

float out00 = d00 + d01 + d02 + d03 + d04;

698

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

699

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

700

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

701

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

702

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

703

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

704

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

705

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

706

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

707

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

708

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

709

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

710

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

711

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

712

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

713

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

714

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

715

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

716

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

717

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

718

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

719

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

720

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

721

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

722

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

723

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

724

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

725

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

726

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

727

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

728

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

729

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

730

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

731

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

732

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

733

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

734

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

735

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

736

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

737

738

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

739

float out00 = d01 + d21 + d41 + d11 + d31;

740

float out01 = d01 + d21 + d41 + d11 + d31;

741

float out02 = d01 + d21 + d41 + d11 + d31;

742

float out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

743

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

744

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

745

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

746

747

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

748

out01 += k1 - d02 - d12 - d22 - d32 - d42;

749

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

750

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

751

752

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

753

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

754

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

755

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

756

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

757

758

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

759

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

760

761

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

762

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

763

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

764

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

765

766

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

767

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

768

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

769

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

770

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

771

772

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

773

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

774

775

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

776

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

777

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

778

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

779

780

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

781

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

782

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

783

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

784

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

785

786

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

787

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

788

789

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

790

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

791

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

792

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

793

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

794

795

int y_in = get_global_id(1);

796

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

797

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

798

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

799

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

800

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

801

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

802

803

#if defined(HAS_BIAS)

804

// Add bias

805

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

806

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

807

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

808

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

809

out00 += (DATA_TYPE)b;

810

out01 += (DATA_TYPE)b;

811

out02 += (DATA_TYPE)b;

812

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

813

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

814

out10 += (DATA_TYPE)b;

815

out11 += (DATA_TYPE)b;

816

out12 += (DATA_TYPE)b;

817

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

818

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

819

out20 += (DATA_TYPE)b;

820

out21 += (DATA_TYPE)b;

821

out22 += (DATA_TYPE)b;

822

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

823

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

824

out30 += (DATA_TYPE)b;

825

out31 += (DATA_TYPE)b;

826

out32 += (DATA_TYPE)b;

827

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

828

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

829

830

#endif // defined(HAS_BIAS)

831

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

832

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

833

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

834

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

835

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

836

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

837

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

838

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

839

840

// Store the 1x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

841

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

842

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

843

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;

844

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;

845

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;

846

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

847

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

848

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

849

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

850

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

851

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

852

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

853

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),

854

A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

855

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;

856

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;

857

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;

858

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

859

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

860

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

861

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

862

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

863

#else /* defined(SRC_DEPTH) */

864

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

865

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

866

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

867

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

868

869

// Store the 4x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

870

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

871

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

872

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

873

out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

874

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

875

out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

876

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

877

out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),

878

VEC_DATA_TYPE(DATA_TYPE, 4)),

879

A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

880

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;

881

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;

882

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;

883

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;

884

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;

885

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;

886

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;

887

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;

888

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;

889

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;

890

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;

891

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;

892

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;

893

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;

894

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;

895

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

896

897

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

898

}

899

900

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

901

({ \

902

comm_fact.s0 = d1 + d2; \

903

comm_fact.s1 = d3 + d4; \

904

comm_fact.s2 = d5 + d6; \

905

\

906

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

907

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

908

\

909

comm_fact.s0 = d1 - d2; \

910

comm_fact.s1 = d3 - d4; \

911

comm_fact.s2 = d5 - d6; \

912

\

913

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

914

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

915

})

916

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

917

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

918

*

919

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

920

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

921

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

922

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

923

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

924

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

925

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

926

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

927

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

928

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

929

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

930

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

931

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

932

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

933

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

934

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

935

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

936

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

937

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

938

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

939

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

940

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

941

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

942

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

943

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

944

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

945

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

946

*/

947

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

948

TENSOR4D_DECLARATION(src),

949

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

950

#if defined(HAS_BIAS)

951

,

952

VECTOR_DECLARATION(bias)

953

#endif // defined(HAS_BIAS)

954

)

955

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

956

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

957

#if defined(SRC_DEPTH)

958

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

959

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

960

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

961

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

962

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

963

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

964

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

965

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

966

// Compute output address

967

int y_in = get_global_id(1);

968

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

969

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

970

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

971

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

972

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

973

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

974

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

975

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

976

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

977

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

978

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

979

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

980

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

981

982

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

983

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

984

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

985

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

986

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

987

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

988

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

989

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

990

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

991

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

992

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

993

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

994

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

995

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

996

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

997

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

998

999

#if defined(HAS_BIAS)

1000

// Add bias

1001

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1002

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1003

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1004

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1005

out00 += (DATA_TYPE)b;

1006

out01 += (DATA_TYPE)b;

1007

out02 += (DATA_TYPE)b;

1008

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1009

#endif // defined(HAS_BIAS)

1010

1011

// Store the output tile

1012

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1013

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1014

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1015

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

1016

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

1017

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

1018

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1019

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1020

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

1021

(__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1022

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1023

1024

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1025

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1026

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1027

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1028

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1029

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1030

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1031

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1032

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1033

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1034

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1035

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1036

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1037

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1038

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1039

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1040

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1041

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1042

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1043

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1044

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1045

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1046

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1047

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1048

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1049

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1050

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1051

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1052

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1053

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1054

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1055

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1056

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1057

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1058

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1059

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1060

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1061

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1062

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1063

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1064

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1065

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1066

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1067

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1068

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1069

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1070

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1071

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1072

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1073

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1074

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1075

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1076

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1077

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1078

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1079

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1080

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1081

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1082

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1083

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1084

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1085

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1086

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1087

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1088

1089

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1090

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1091

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1092

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1093

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1094

1095

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1096

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1097

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1098

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1099

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1100

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1101

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1102

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1103

1104

// Compute the 4x4 output tile

1105

comm_fact0 = tmp_col1 + tmp_col2;

1106

comm_fact1 = tmp_col3 + tmp_col4;

1107

comm_fact2 = tmp_col5 + tmp_col6;

1108

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1109

VEC_DATA_TYPE(float, 4)

1110

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

1111

VEC_DATA_TYPE(float, 4)

1112

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1113

1114

comm_fact0 = tmp_col1 - tmp_col2;

1115

comm_fact1 = tmp_col3 - tmp_col4;

1116

comm_fact2 = tmp_col5 - tmp_col6;

1117

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1118

VEC_DATA_TYPE(float, 4)

1119

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

1120

VEC_DATA_TYPE(float, 4)

1121

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1122

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1123

#if defined(HAS_BIAS)

1124

// Add bias

1125

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1126

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1127

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1128

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1129

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1130

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1131

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1132

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1133

#endif // defined(HAS_BIAS)

1134

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1135

// Store the output tile

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1136

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,

1137

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

1138

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,

1139

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

1140

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,

1141

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

1142

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,

1143

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1144

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1145

}

1146

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1147

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1148

*

1149

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1150

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1151

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1152

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1153

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1154

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1155

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1156

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1157

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1158

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1159

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1160

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1161

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1162

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1163

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1164

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1165

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1166

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1167

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1168

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1169

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1170

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1171

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1172

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1173

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1174

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1175

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1176

*/

1177

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1178

TENSOR4D_DECLARATION(src),

1179

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1180

#if defined(HAS_BIAS)

1181

VECTOR_DECLARATION(bias),

1182

#endif // defined(HAS_BIAS)

1183

int dst_size)

1184

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1185

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1186

#if defined(SRC_DEPTH)

1187

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1188

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1189

#else /* defined(SRC_DEPTH) */

1190

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1191

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

1192

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1193

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1194

int y_in = get_global_id(1);

1195

int x_out = get_global_id(0);

1196

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

1197

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1198

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1199

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1200

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1201

1202

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1203

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

1204

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

1205

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

1206

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

1207

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

1208

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

1209

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

1210

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1211

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1212

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1213

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1214

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

1215

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

1216

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

1217

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1218

1219

#if defined(HAS_BIAS)

1220

// Add bias

1221

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1222

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1223

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1224

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1229

#endif // defined(HAS_BIAS)

1230

1231

// Store the output tile

1232

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1233

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1234

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1235

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1236

#else /* defined(SRC_DEPTH) */

1237

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1238

#endif /* defined(SRC_DEPTH) */

1239

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1240

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1241

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1242

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1243

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

1244

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

1245

*(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;

1246

*(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1247

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1248

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1249

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1250

VEC_DATA_TYPE(DATA_TYPE, 4)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1251

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

1252

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1253

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

1254

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

1255

*(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;

1256

*(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1257

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1258

1259

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1260

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1261

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1262

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1263

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1264

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1265

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1266

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1267

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1268

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1269

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1270

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1271

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1272

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1273

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1274

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1275

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1276

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1277

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1278

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1279

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1280

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1281

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1282

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1283

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1284

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1285

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1286

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1287

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1288

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1289

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1290

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1291

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1292

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1293

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1294

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1295

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1296

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1297

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1298

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1299

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1300

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1301

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1302

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1303

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1304

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1305

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1306

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1307

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1308

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1309

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1310

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1311

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1312

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1313

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1314

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1315

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1316

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1317

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1318

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1319

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1320

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1321

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1322

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1323

1324

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1325

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1326

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1327

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1328

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1329

1330

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1331

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1332

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1333

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1334

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1335

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1336

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1337

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1338

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1339

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1340

comm_fact0 = tmp_col1 + tmp_col2;

1341

comm_fact1 = tmp_col3 + tmp_col4;

1342

comm_fact2 = tmp_col5 + tmp_col6;

1343

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1344

VEC_DATA_TYPE(float, 4)

1345

out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1346

VEC_DATA_TYPE(float, 4)

1347

out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1348

1349

comm_fact0 = tmp_col1 - tmp_col2;

1350

comm_fact1 = tmp_col3 - tmp_col4;

1351

comm_fact2 = tmp_col5 - tmp_col6;

1352

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1353

VEC_DATA_TYPE(float, 4)

1354

out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1355

VEC_DATA_TYPE(float, 4)

1356

out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1357

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1358

#if defined(HAS_BIAS)

1359

// Add bias

1360

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1361

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1362

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1363

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1364

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1365

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1366

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1367

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1368

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1369

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1370

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1371

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1372

#else /* defined(SRC_DEPTH) */

1373

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1374

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

1375

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1376

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1377

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1378

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1379

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1380

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1381

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1382

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1383

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1384

out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1385

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1386

out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1387

1388

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

1389

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

1390

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;

1391

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;

1392

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

1393

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

1394

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;

1395

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;

1396

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;

1397

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;

1398

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;

1399

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;

1400

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;

1401

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;

1402

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;

1403

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1404

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1405

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1406

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1407

1408

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1409

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1410

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1411

*

1412

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1413

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1414

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1415

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1416

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1417

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1418

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1419

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1420

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1421

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1422

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1423

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1424

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1425

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1426

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1427

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1428

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1429

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1430

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1431

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1432

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1433

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1434

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1435

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1436

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1437

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1438

*/

1439

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1440

TENSOR4D_DECLARATION(src),

1441

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1442

#if defined(HAS_BIAS)

1443

,

1444

VECTOR_DECLARATION(bias)

1445

#endif // defined(HAS_BIAS)

1446

)

1447

{

1448

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1455

src_stride_w,

1456

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1457

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1465

dst_stride_w,

1466

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1467

dst_offset_first_element_in_bytes

1468

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1474

#endif // defined(HAS_BIAS)

1475

);

1476

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1477

1478

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

1479

*

1480

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1481

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1482

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1483

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1484

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1485

*

1486

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1487

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1488

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1489

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1490

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1491

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1492

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1493

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1494

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1495

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1496

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1497

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1498

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1499

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1500

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1501

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1502

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1503

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1504

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1505

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1506

*/

1507

__kernel void winograd_output_transform_2x1_7x1_nhwc(

1508

TENSOR4D_DECLARATION(src),

1509

TENSOR4D_DECLARATION(dst),

1510

#if defined(HAS_BIAS)

1511

VECTOR_DECLARATION(bias),

1512

#endif // defined(HAS_BIAS)

1513

int dst_size)

1514

{

1515

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1535

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1540

#endif // defined(HAS_BIAS)

1541

dst_size);

1542

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1543

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1544

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1545

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1546

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1547

*

1548

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1549

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1550

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1551

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1552

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1553

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1554

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1555

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1556

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1557

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1558

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1559

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1560

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1561

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1562

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1563

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1564

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1565

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1566

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1567

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1568

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1569

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1570

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1571

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1572

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1573

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1574

*/

1575

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1576

TENSOR4D_DECLARATION(src),

1577

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1578

#if defined(HAS_BIAS)

1579

,

1580

VECTOR_DECLARATION(bias)

1581

#endif // defined(HAS_BIAS)

1582

)

1583

{

1584

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1591

src_stride_w,

1592

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1593

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1601

dst_stride_w,

1602

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1603

dst_offset_first_element_in_bytes

1604

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1610

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1615

*

1616

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1617

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1618

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1619

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1620

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1621

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1622

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1623

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1624

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1625

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1626

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1627

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1628

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1629

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1630

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1631

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1632

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1633

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1634

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1635

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1636

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1637

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1638

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1639

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1640

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1641

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1642

*/

1643

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1644

TENSOR4D_DECLARATION(src),

1645

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1646

#if defined(HAS_BIAS)

1647

,

1648

VECTOR_DECLARATION(bias)

1649

#endif // defined(HAS_BIAS)

1650

)

1651

{

1652

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1659

src_stride_w,

1660

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1661

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1669

dst_stride_w,

1670

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1671

dst_offset_first_element_in_bytes

1672

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1678

#endif // defined(HAS_BIAS)

1679

);

1680

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1681

1682

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1683

*

1684

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1685

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1686

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1687

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1688

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1689

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1690

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1691

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1692

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1693

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1694

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1695

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1696

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1697

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1698

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1699

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1700

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1701

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1702

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1703

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1704

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1705

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1706

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1707

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1708

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1709

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1710

*/

1711

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1712

TENSOR4D_DECLARATION(src),

1713

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1714

#if defined(HAS_BIAS)

1715

VECTOR_DECLARATION(bias),

1716

#endif // defined(HAS_BIAS)

1717

int dst_size)

1718

{

1719

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1726

src_stride_w,

1727

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1728

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1736

dst_stride_w,

1737

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1738

dst_offset_first_element_in_bytes,

1739

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1744

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1749

*

1750

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1751

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1752

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1753

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1754

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1755

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1756

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1757

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1758

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1759

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1760

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1761

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1762

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1763

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1764

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1765

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1766

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1767

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1768

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1769

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1770

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1771

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1772

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1773

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1774

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1775

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1776

*/

1777

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1778

TENSOR4D_DECLARATION(src),

1779

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1780

#if defined(HAS_BIAS)

1781

VECTOR_DECLARATION(bias),

1782

#endif // defined(HAS_BIAS)

1783

int dst_size)

1784

{

1785

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1792

src_stride_w,

1793

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1794

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1802

dst_stride_w,

1803

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1804

dst_offset_first_element_in_bytes,

1805

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1810

#endif // defined(HAS_BIAS)

1811

dst_size);

1812

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1813

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1814

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1815

1816

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1817

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1818

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1819

*

1820

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1821

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1822

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1823

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1824

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1825

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1826

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1827

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1828

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1829

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1830

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1831

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1832

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1833

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1834

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1835

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1836

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1837

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1838

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1839

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1840

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1841

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1842

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1843

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1844

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1845

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1846

*/

1847

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1848

TENSOR4D_DECLARATION(src),

1849

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1850

#if defined(HAS_BIAS)

1851

,

1852

VECTOR_DECLARATION(bias)

1853

#endif // defined(HAS_BIAS)

1854

)

1855

{

1856

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1863

src_stride_w,

1864

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1865

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1873

dst_stride_w,

1874

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1875

dst_offset_first_element_in_bytes

1876

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1882

#endif // defined(HAS_BIAS)

1883

);

1884

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1885

1886

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

1887

*

1888

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1889

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1890

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1891

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1892

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1893

*

1894

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1895

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1896

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1897

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1898

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1899

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1900

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1901

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1902

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1903

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1904

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1905

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1906

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1907

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1908

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1909

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1910

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1911

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1912

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1913

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1914

*/

1915

__kernel void winograd_output_transform_1x2_1x7_nhwc(

1916

TENSOR4D_DECLARATION(src),

1917

TENSOR4D_DECLARATION(dst),

1918

#if defined(HAS_BIAS)

1919

VECTOR_DECLARATION(bias),

1920

#endif // defined(HAS_BIAS)

1921

int dst_size)

1922

{

1923

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1943

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1948

#endif // defined(HAS_BIAS)

1949

dst_size);

1950

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1951

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1952

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1953

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1954

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1955

*

1956

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1957

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1958

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1959

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1960

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1961

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1962

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1963

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1964

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1965

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1966

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1967

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1968

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1969

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1970

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1971

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1972

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1973

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1974

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1975

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1976

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1977

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1978

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1979

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1980

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1981

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1982

*/

1983

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1984

TENSOR4D_DECLARATION(src),

1985

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1986

#if defined(HAS_BIAS)

1987

,

1988

VECTOR_DECLARATION(bias)

1989

#endif // defined(HAS_BIAS)

1990

)

1991

{

1992

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1999

src_stride_w,

2000

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2001

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2009

dst_stride_w,

2010

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2011

dst_offset_first_element_in_bytes

2012

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2018

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

2023

*

2024

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2025

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2026

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2027

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2028

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2029

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2030

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2031

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2032

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2033

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2034

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2035

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2036

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2037

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2038

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2039

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2040

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2041

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2042

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2043

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2044

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2045

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2046

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2047

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2048

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2049

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2050

*/

2051

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2052

TENSOR4D_DECLARATION(src),

2053

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2054

#if defined(HAS_BIAS)

2055

,

2056

VECTOR_DECLARATION(bias)

2057

#endif // defined(HAS_BIAS)

2058

)

2059

{

2060

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2067

src_stride_w,

2068

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2069

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2077

dst_stride_w,

2078

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2079

dst_offset_first_element_in_bytes

2080

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2086

#endif // defined(HAS_BIAS)

2087

);

2088

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2089

2090

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

2091

*

2092

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2093

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2094

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2095

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2096

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2097

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2098

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2099

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2100

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2101

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2102

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2103

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2104

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2105

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2106

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2107

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2108

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2109

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2110

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2111

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2112

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2113

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2114

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2115

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2116

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2117

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2118

*/

2119

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2120

TENSOR4D_DECLARATION(src),

2121

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2122

#if defined(HAS_BIAS)

2123

VECTOR_DECLARATION(bias),

2124

#endif // defined(HAS_BIAS)

2125

int dst_size)

2126

{

2127

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2134

src_stride_w,

2135

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2136

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2144

dst_stride_w,

2145

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2146

dst_offset_first_element_in_bytes,

2147

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2152

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

2157

*

2158

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2159

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2160

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2161

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2162

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2163

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2164

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2165

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2166

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2167

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2168

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2169

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2170

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2171

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2172

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2173

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2174

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2175

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2176

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2177

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2178

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2179

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2180

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2181

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2182

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2183

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2184

*/

2185

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2186

TENSOR4D_DECLARATION(src),

2187

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2188

#if defined(HAS_BIAS)

2189

VECTOR_DECLARATION(bias),

2190

#endif // defined(HAS_BIAS)

2191

int dst_size)

2192

{

2193

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2200

src_stride_w,

2201

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2202

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2210

dst_stride_w,

2211

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2212

dst_offset_first_element_in_bytes,

2213

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2218

#endif // defined(HAS_BIAS)

2219

dst_size);

2220

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

2221

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2222

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas