Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

37

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

38

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

39

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

40

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

41

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

43

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

44

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

45

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

46

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

47

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

48

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

49

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

50

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

51

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

52

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

53

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

54

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

55

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

56

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

57

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

58

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

59

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

60

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

*/

63

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

64

TENSOR4D_DECLARATION(src),

65

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

66

#if defined(HAS_BIAS)

67

,

68

VECTOR_DECLARATION(bias)

69

#endif // defined(HAS_BIAS)

70

)

71

{

72

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

73

#if defined(SRC_DEPTH)

74

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

75

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

76

#else /* defined(SRC_DEPTH) */

77

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

78

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

79

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

80

81

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

82

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

83

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

84

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

85

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

88

// Compute the 2x1 or 1x2 output tile

89

// out00 = d00 + d01 + d02

90

// out01 = d01 - d02 - d03

91

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

92

float out00 = d00 + d01 + d02;

93

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

94

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

95

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

96

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

97

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

98

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

99

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

101

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

102

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

103

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

104

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

105

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

106

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

107

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

108

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

109

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

110

111

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

112

float k0 = d01 + d11 + d21;

113

float k1 = d02 + d12 + d22;

114

float k2 = d11 - d21 - d31;

115

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

118

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

119

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

120

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

121

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

126

127

out00 += d00 + d20 + k0 + k1;

128

out01 += k0 - k1 - (d03 + d23);

129

out10 += -d20 - d30 + k2 + k3;

130

out11 += k2 - k3 + d23 + d33;

131

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

132

133

int y_in = get_global_id(1);

134

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

135

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

136

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

137

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

138

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

139

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

141

#if defined(HAS_BIAS)

142

// Add bias

143

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

144

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

145

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

147

out00 += (float)b;

148

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

#endif // defined(HAS_BIAS)

150

151

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

152

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

153

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

154

#else /* defined(SRC_DEPTH) */

155

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

156

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

157

158

// Store the output tile

159

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Michalis Spyrou

ef6ec50

2020-07-31 11:38:36 +0100

[diff] [blame]

160

const VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

161

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

162

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

163

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

164

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

165

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

166

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

167

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

168

169

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

170

#if defined(HAS_BIAS)

171

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

172

out10 += (DATA_TYPE)b;

173

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

174

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

175

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

176

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

177

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

178

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

179

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

180

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

181

*

182

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

183

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

184

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

185

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

186

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

187

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

188

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

189

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

190

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

191

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

192

*

193

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

194

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

195

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

196

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

197

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

198

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

199

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

200

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

201

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

202

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

203

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

204

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

205

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

206

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

207

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

208

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

209

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

210

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

211

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

212

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

213

*/

214

__kernel void winograd_output_transform_2x2_7x7_nhwc(

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

215

TENSOR4D(src, BUFFER),

216

TENSOR4D(dst, BUFFER),

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

217

#if defined(HAS_BIAS)

218

VECTOR_DECLARATION(bias),

219

#endif // defined(HAS_BIAS)

220

int dst_size)

221

{

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

222

#define _ISRC_HEIGHT SRC_HEIGHT

223

#define _IDST_WIDTH DST_WIDTH

224

#define _IDST_HEIGHT DST_HEIGHT

225

#define _INUM_TILES_X NUM_TILES_X

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

226

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

227

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

228

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

229

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

230

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

231

int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

232

int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

233

234

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

235

TILE(DATA_TYPE, 8, N0, in);

236

TILE(DATA_TYPE, 2, N0, out);

237

TILE(uint, 8, 1, src_indirect_y);

238

239

// Calculate the indirect Y for the source tensor

240

LOOP_UNROLLING(int, i, 0, 8, 1)

241

{

242

src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;

243

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);

244

}

245

246

// Load the values across the 8 channels to compose the 8x1 tile

247

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

248

249

// Compute out0 and out01

250

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;

251

out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

252

253

#if defined(HAS_BIAS)

254

// Add bias

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

255

TILE(DATA_TYPE, 1, N0, b);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

256

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

257

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

258

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

259

T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

260

#endif // defined(HAS_BIAS)

261

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

262

T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

263

264

TILE(uint, 2, 1, dst_indirect_y);

265

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

266

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

267

LOOP_UNROLLING(int, yk, 0, 2, 1)

268

{

269

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

270

dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);

271

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

272

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

273

LOOP_UNROLLING(int, xk, 0, 2, 1)

274

{

275

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

276

dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);

277

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

278

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

279

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

280

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

281

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

282

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

283

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

284

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

285

TILE(DATA_TYPE, 64, N0, in);

286

TILE(DATA_TYPE, 4, N0, out);

287

TILE(DATA_TYPE, 16, N0, tmp);

288

TILE(uint, 64, 1, src_indirect_y);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

289

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

290

// Calculate the indirect Y for the source tensor

291

LOOP_UNROLLING(int, i, 0, 64, 1)

292

{

293

src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;

294

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);

295

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

296

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

297

// Load the values across the 64 channels to compose the 8x8 tile

298

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

299

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

300

LOOP_UNROLLING(int, i, 0, 8, 1)

301

{

302

tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;

303

tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;

304

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

305

306

// Compute the 2x2 output tile

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

307

LOOP_UNROLLING(int, i, 0, 2, 1)

308

{

309

out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;

310

out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;

311

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

312

313

#if defined(HAS_BIAS)

314

// Add bias

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

315

TILE(DATA_TYPE, 1, N0, b);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

316

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

317

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

318

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

319

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

320

#endif // defined(HAS_BIAS)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

321

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

322

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

323

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

324

TILE(uint, 4, 1, dst_indirect_y);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

325

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

326

// Calculate the destination indirect Y

327

LOOP_UNROLLING(int, yk, 0, 2, 1)

328

{

329

LOOP_UNROLLING(int, xk, 0, 2, 1)

330

{

331

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

332

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

333

dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;

334

dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);

335

}

336

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

337

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

338

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

339

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

340

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

341

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

342

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

343

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

344

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

345

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

346

*

347

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

348

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

349

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

350

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

351

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

352

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

353

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

354

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

355

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

356

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

357

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

358

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

359

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

360

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

361

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

362

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

363

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

364

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

365

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

366

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

367

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

368

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

369

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

370

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

371

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

372

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

373

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

374

*/

375

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

376

TENSOR4D_DECLARATION(src),

377

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

378

#if defined(HAS_BIAS)

379

,

380

VECTOR_DECLARATION(bias)

381

#endif // defined(HAS_BIAS)

382

)

383

{

384

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

385

#if defined(SRC_DEPTH)

386

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

387

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

388

#else /* defined(SRC_DEPTH) */

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

389

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

390

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

391

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

392

393

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

394

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

395

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

396

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

397

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

398

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

399

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

400

401

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

402

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

403

float out00 = d00 + d01 + d02 + d03 + d04;

404

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

405

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

406

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

407

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

408

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

409

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

410

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

411

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

412

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

413

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

414

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

415

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

416

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

417

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

418

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

419

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

420

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

421

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

422

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

423

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

424

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

425

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

426

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

427

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

428

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

429

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

430

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

431

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

432

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

433

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

434

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

435

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

436

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

437

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

438

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

439

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

440

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

441

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

442

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

443

444

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

445

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

446

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

447

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

448

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

449

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

450

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

451

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

452

453

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

454

out01 += k1 - d02 - d12 - d22 - d32 - d42;

455

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

456

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

457

458

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

459

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

460

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

461

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

462

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

463

464

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

465

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

466

467

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

468

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

469

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

470

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

471

472

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

473

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

474

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

475

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

476

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

477

478

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

479

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

480

481

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

482

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

483

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

484

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

485

486

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

487

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

488

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

489

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

490

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

491

492

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

493

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

494

495

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

496

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

497

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

498

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

499

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

500

501

int y_in = get_global_id(1);

502

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

503

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

504

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

505

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

506

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

507

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

508

509

#if defined(HAS_BIAS)

510

// Add bias

511

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

512

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

513

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

514

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

519

#endif // defined(HAS_BIAS)

520

521

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

522

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

523

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

524

#else /* defined(SRC_DEPTH) */

525

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

526

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

527

528

// Store the output tile

529

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

530

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

531

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

532

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

533

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

534

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

535

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

536

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

537

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

538

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

539

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

540

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

541

542

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

543

#if defined(HAS_BIAS)

544

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

549

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

554

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

559

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

560

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

561

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

562

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

563

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

564

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

565

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

566

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

567

}

568

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

569

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

570

*

571

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

572

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

573

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

574

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

575

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

576

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

577

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

578

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

579

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

580

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

581

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

582

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

583

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

584

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

585

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

586

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

587

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

588

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

589

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

590

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

591

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

592

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

593

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

594

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

595

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

596

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

597

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

598

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

599

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

600

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

601

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

602

* @param[in] dst_size Size of the destination tensor, minus the last padding

603

*/

604

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

605

TENSOR4D(src, BUFFER),

606

TENSOR4D(dst, BUFFER),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

607

#if defined(HAS_BIAS)

608

VECTOR_DECLARATION(bias),

609

#endif // defined(HAS_BIAS)

610

int dst_size)

611

{

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

612

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

613

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

614

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

615

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

616

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

617

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

618

TILE(DATA_TYPE, 6, N0, in) = { { { 0 } } };

619

TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } };

620

TILE(uint, 6, 1, src_indirect_y) = { { { 0 } } };

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

621

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

622

LOOP_UNROLLING(int, i, 0, 6, 1)

623

{

624

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

625

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);

626

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

627

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

628

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

629

T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

630

631

// Compute out00, out01, out02 and out03

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

632

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;

633

out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;

634

out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;

635

out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

636

637

#if defined(HAS_BIAS)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

638

TILE(DATA_TYPE, 1, N0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

639

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame^]

640

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

641

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

642

// c = c + bias[broadcasted]

643

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

644

#endif // HAS_BIAS

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

645

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

646

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

647

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

648

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

649

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

650

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

651

TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } };

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

652

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

653

// Calculate the destination indirect Y

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

654

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

655

LOOP_UNROLLING(int, yk, 0, 4, 1)

656

{

657

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

658

dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;

659

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

660

}

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

661

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

662

LOOP_UNROLLING(int, xk, 0, 4, 1)

663

{

664

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

665

dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;

666

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

667

}

668

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

669

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

670

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

671

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

672

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

673

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

674

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

675

// Calculate the indirect Y for the source tensor

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

676

TILE(DATA_TYPE, 36, N0, in) = { { { 0 } } };

677

TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } };

678

TILE(uint, 36, 1, src_indirect_y) = { { { 0 } } };

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

679

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

680

LOOP_UNROLLING(int, i, 0, 36, 1)

681

{

682

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

683

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);

684

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

685

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

686

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

687

T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

688

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

689

LOOP_UNROLLING(int, i, 0, 6, 1)

690

{

691

tmp[0].v = in[6 + i].v + in[12 + i].v;

692

tmp[1].v = in[6 + i].v - in[12 + i].v;

693

tmp[2].v = in[18 + i].v + in[24 + i].v;

694

tmp[3].v = in[18 + i].v - in[24 + i].v;

695

tmp[3].v = tmp[3].v + tmp[3].v;

696

in[i].v = in[i].v + tmp[0].v + tmp[2].v;

697

in[6 + i].v = tmp[3].v + tmp[1].v;

698

in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

699

in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;

700

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

701

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

702

// Compute the output tile

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

703

TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } };

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

704

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

705

LOOP_UNROLLING(int, i, 0, 4, 1)

706

{

707

tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;

708

tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;

709

tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;

710

tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;

711

tmp[3].v = tmp[3].v + tmp[3].v;

712

out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;

713

out[4 * i + 1].v = tmp[3].v + tmp[1].v;

714

out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

715

out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;

716

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

717

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

718

#if defined(HAS_BIAS)

719

TILE(DATA_TYPE, 1, N0, b);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

720

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame^]

721

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

722

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

723

// c = c + bias[broadcasted]

724

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

725

#endif // HAS_BIAS

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

726

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

727

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

728

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

729

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

730

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

731

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

732

TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } };

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

733

734

// Calculate the destination indirect Y

735

LOOP_UNROLLING(int, yk, 0, 4, 1)

736

{

737

LOOP_UNROLLING(int, xk, 0, 4, 1)

738

{

739

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

740

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

741

dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;

742

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

}

}

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

747

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

748

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

749

}

750

751

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

752

({ \

753

comm_fact.s0 = d1 + d2; \

754

comm_fact.s1 = d3 + d4; \

755

comm_fact.s2 = d5 + d6; \

756

\

757

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

758

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

759

\

760

comm_fact.s0 = d1 - d2; \

761

comm_fact.s1 = d3 - d4; \

762

comm_fact.s2 = d5 - d6; \

763

\

764

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

765

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

766

})

767

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

768

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

769

*

770

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

771

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

772

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

773

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

774

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

775

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

776

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

777

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

778

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

779

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

780

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

781

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

782

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

783

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

784

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

785

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

786

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

787

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

788

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

789

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

790

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

791

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

792

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

793

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

794

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

795

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

796

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

797

*/

798

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

799

TENSOR4D_DECLARATION(src),

800

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

801

#if defined(HAS_BIAS)

802

,

803

VECTOR_DECLARATION(bias)

804

#endif // defined(HAS_BIAS)

805

)

806

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

807

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

808

#if defined(SRC_DEPTH)

809

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

810

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

811

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

812

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

813

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

814

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

815

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

816

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

817

// Compute output address

818

int y_in = get_global_id(1);

819

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

820

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

821

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

822

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

823

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

824

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

825

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

826

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

827

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

828

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

829

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

830

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

831

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

832

833

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

834

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

835

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

836

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

837

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

838

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

839

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

840

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

841

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

842

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

843

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

844

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

845

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

846

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

847

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

848

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

849

850

#if defined(HAS_BIAS)

851

// Add bias

852

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

853

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

854

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

855

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

856

out00 += (DATA_TYPE)b;

857

out01 += (DATA_TYPE)b;

858

out02 += (DATA_TYPE)b;

859

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

860

#endif // defined(HAS_BIAS)

861

862

// Store the output tile

863

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

864

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

865

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

866

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

867

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

868

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

869

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

870

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

871

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

872

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

873

(__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

874

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

875

876

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

877

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

878

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

879

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

880

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

881

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

882

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

883

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

884

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

885

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

886

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

887

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

888

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

889

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

890

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

891

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

892

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

893

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

894

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

895

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

896

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

897

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

898

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

899

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

900

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

901

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

902

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

903

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

904

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

905

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

906

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

907

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

908

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

909

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

910

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

911

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

912

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

913

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

914

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

915

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

916

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

917

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

918

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

919

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

920

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

921

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

922

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

923

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

924

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

925

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

926

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

927

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

928

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

929

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

930

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

931

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

932

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

933

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

934

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

935

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

936

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

937

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

938

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

939

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

940

941

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

942

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

943

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

944

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

945

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

946

947

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

948

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

949

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

950

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

951

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

952

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

953

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

954

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

955

956

// Compute the 4x4 output tile

957

comm_fact0 = tmp_col1 + tmp_col2;

958

comm_fact1 = tmp_col3 + tmp_col4;

959

comm_fact2 = tmp_col5 + tmp_col6;

960

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

961

VEC_DATA_TYPE(float, 4)

962

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

963

VEC_DATA_TYPE(float, 4)

964

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

965

966

comm_fact0 = tmp_col1 - tmp_col2;

967

comm_fact1 = tmp_col3 - tmp_col4;

968

comm_fact2 = tmp_col5 - tmp_col6;

969

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

970

VEC_DATA_TYPE(float, 4)

971

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

972

VEC_DATA_TYPE(float, 4)

973

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

974

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

975

#if defined(HAS_BIAS)

976

// Add bias

977

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

978

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

979

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

980

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

981

out_col0 += (VEC_DATA_TYPE(float, 4))b;

982

out_col1 += (VEC_DATA_TYPE(float, 4))b;

983

out_col2 += (VEC_DATA_TYPE(float, 4))b;

984

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

985

#endif // defined(HAS_BIAS)

986

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

987

// Store the output tile

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

988

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

989

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

990

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

991

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

992

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

993

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

994

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

995

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

996

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

997

}

998

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

999

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1000

*

1001

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1002

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1003

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1004

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1005

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1006

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1007

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1008

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1009

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1010

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1011

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1012

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1013

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1014

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1015

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1016

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1017

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1018

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1019

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1020

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1021

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1022

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1023

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1024

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1025

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1026

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1027

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1028

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1029

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1030

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1031

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1032

*/

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

1033

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1034

TENSOR4D(src, BUFFER),

1035

TENSOR4D(dst, BUFFER),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1036

#if defined(HAS_BIAS)

1037

VECTOR_DECLARATION(bias),

1038

#endif // defined(HAS_BIAS)

1039

int dst_size)

1040

{

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1041

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

1042

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

1043

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1044

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1045

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Sheri Zhang

2021-04-12 10:53:57 +0100

[diff] [blame]

1046

TILE(DATA_TYPE, 8, N0, in) = { { { 0 } } };

1047

TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } };

1048

TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } };

1049

TILE(uint, 8, 1, src_indirect_y) = { { { 0 } } };

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1050

1051

LOOP_UNROLLING(int, i, 0, 8, 1)

1052

{

1053

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

1054

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);

1055

}

1056

1057

// "in" contains 1x8 or 8x1 tile here

1058

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

1059

1060

// A^T * in, and in this degenerate case out consists of 1 column/row

1061

tmp[0].v = in[1].v - in[2].v;

1062

tmp[1].v = 2.0f * (in[3].v - in[4].v);

1063

tmp[2].v = 2.0f * (in[5].v + in[6].v);

1064

tmp[3].v = in[3].v + in[4].v;

1065

out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

1066

out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1067

out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;

1068

out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1069

1070

#if defined(HAS_BIAS)

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1071

TILE(DATA_TYPE, 1, N0, b);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1072

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame^]

1073

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1074

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1075

// c = c + bias[broadcasted]

1076

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

1077

#endif // HAS_BIAS

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1078

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1079

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

1080

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

1081

1082

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

1083

Sheri Zhang

2021-04-12 10:53:57 +0100

[diff] [blame]

1084

TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } };

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1085

1086

// Calculate the destination indirect Y

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1087

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1088

LOOP_UNROLLING(int, yk, 0, 4, 1)

1089

{

1090

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

1091

dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;

1092

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

1093

}

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

1094

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1095

LOOP_UNROLLING(int, xk, 0, 4, 1)

1096

{

1097

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

1098

dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;

1099

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

1100

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1101

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1102

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1103

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

1104

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

1105

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1106

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1107

// Calculate the indirect Y for the source tensor

Sheri Zhang

2021-04-12 10:53:57 +0100

[diff] [blame]

1108

TILE(DATA_TYPE, 64, N0, in) = { { { 0 } } };

1109

TILE(DATA_TYPE, 6, N0, tmp) = { { { 0 } } };

1110

TILE(uint, 64, 1, src_indirect_y) = { { { 0 } } };

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1111

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1112

LOOP_UNROLLING(int, i, 0, 64, 1)

1113

{

1114

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

1115

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);

1116

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1117

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1118

// "in" here is 8x8 tile

1119

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1120

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1121

// A^T * in

1122

LOOP_UNROLLING(int, i, 0, 8, 1)

1123

{

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

1124

tmp[0].v = in[8 + i].v + in[16 + i].v;

1125

tmp[1].v = in[8 + i].v - in[16 + i].v;

1126

tmp[2].v = in[24 + i].v + in[32 + i].v;

1127

tmp[3].v = in[24 + i].v - in[32 + i].v;

1128

tmp[3].v = tmp[3].v + tmp[3].v;

1129

tmp[4].v = in[40 + i].v + in[48 + i].v;

1130

tmp[4].v = tmp[4].v + tmp[4].v;

1131

tmp[5].v = in[40 + i].v - in[48 + i].v;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1132

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1133

// 4x8 matrix as a result

1134

in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

1135

in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

1136

in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);

1137

in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;

1138

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1139

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1140

// Compute the output tile

Sheri Zhang

2021-04-12 10:53:57 +0100

[diff] [blame]

1141

TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } };

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1142

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1143

// in * A, with in = A^T * in as above

1144

LOOP_UNROLLING(int, i, 0, 4, 1)

1145

{

Giorgio Arena

2021-04-08 16:41:51 +0100

[diff] [blame]

1146

tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;

1147

tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;

1148

tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;

1149

tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;

1150

tmp[3].v = tmp[3].v + tmp[3].v;

1151

tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;

1152

tmp[4].v = tmp[4].v + tmp[4].v;

1153

tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1154

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1155

// 4x4 tile

1156

out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

1157

out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

1158

out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;

1159

out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;

1160

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1161

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1162

#if defined(HAS_BIAS)

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1163

TILE(DATA_TYPE, 1, N0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1164

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame^]

1165

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1166

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1167

// c = c + bias[broadcasted]

1168

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

1169

#endif // HAS_BIAS

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1170

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1171

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

1172

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1173

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1174

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1175

Sheri Zhang

2021-04-12 10:53:57 +0100

[diff] [blame]

1176

TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } };

Aleksandr Nikolaev

2021-04-08 10:06:34 +0100

[diff] [blame]

1177

1178

// Calculate the destination indirect Y

1179

LOOP_UNROLLING(int, yk, 0, 4, 1)

1180

{

1181

LOOP_UNROLLING(int, xk, 0, 4, 1)

1182

{

1183

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

1184

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

1185

dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;

1186

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

}

}

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

1191

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1192

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1193

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1194

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1195

1196

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1197

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1198

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1199

*

1200

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1201

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1202

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1203

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1204

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1205

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1206

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1207

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1208

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1209

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1210

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1211

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1212

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1213

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1214

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1215

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1216

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1217

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1218

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1219

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1220

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1221

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1222

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1223

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1224

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1225

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1226

*/

1227

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1228

TENSOR4D_DECLARATION(src),

1229

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1230

#if defined(HAS_BIAS)

1231

,

1232

VECTOR_DECLARATION(bias)

1233

#endif // defined(HAS_BIAS)

1234

)

1235

{

1236

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1243

src_stride_w,

1244

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1245

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1253

dst_stride_w,

1254

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1255

dst_offset_first_element_in_bytes

1256

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1262

#endif // defined(HAS_BIAS)

1263

);

1264

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1265

1266

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

1267

*

1268

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1269

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1270

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1271

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1272

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1273

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1274

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1275

*

1276

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1277

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1278

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1279

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1280

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1281

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1282

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1283

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1284

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1285

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1286

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1287

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1288

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1289

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1290

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1291

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1292

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1293

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1294

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1295

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1296

*/

1297

__kernel void winograd_output_transform_2x1_7x1_nhwc(

1298

TENSOR4D_DECLARATION(src),

1299

TENSOR4D_DECLARATION(dst),

1300

#if defined(HAS_BIAS)

1301

VECTOR_DECLARATION(bias),

1302

#endif // defined(HAS_BIAS)

1303

int dst_size)

1304

{

1305

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1325

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1330

#endif // defined(HAS_BIAS)

1331

dst_size);

1332

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1333

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1334

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1335

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1336

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1337

*

1338

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1339

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1340

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1341

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1342

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1343

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1344

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1345

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1346

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1347

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1348

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1349

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1350

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1351

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1352

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1353

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1354

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1355

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1356

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1357

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1358

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1359

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1360

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1361

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1362

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1363

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1364

*/

1365

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1366

TENSOR4D_DECLARATION(src),

1367

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1368

#if defined(HAS_BIAS)

1369

,

1370

VECTOR_DECLARATION(bias)

1371

#endif // defined(HAS_BIAS)

1372

)

1373

{

1374

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1381

src_stride_w,

1382

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1383

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1391

dst_stride_w,

1392

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1393

dst_offset_first_element_in_bytes

1394

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1400

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1405

*

1406

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1407

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1408

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1409

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1410

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1411

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1412

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1413

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1414

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1415

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1416

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1417

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1418

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1419

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1420

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1421

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1422

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1423

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1424

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1425

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1426

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1427

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1428

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1429

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1430

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1431

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1432

*/

1433

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1434

TENSOR4D_DECLARATION(src),

1435

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1436

#if defined(HAS_BIAS)

1437

,

1438

VECTOR_DECLARATION(bias)

1439

#endif // defined(HAS_BIAS)

1440

)

1441

{

1442

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1449

src_stride_w,

1450

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1451

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1459

dst_stride_w,

1460

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1461

dst_offset_first_element_in_bytes

1462

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1468

#endif // defined(HAS_BIAS)

1469

);

1470

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1471

1472

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1473

*

1474

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1475

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1476

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1477

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1478

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1479

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1480

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1481

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1482

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1483

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1484

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1485

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1486

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1487

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1488

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1489

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1490

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1491

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1492

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1493

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1494

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1495

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1496

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1497

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1498

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1499

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1500

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1501

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1502

*/

1503

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1504

TENSOR4D_DECLARATION(src),

1505

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1506

#if defined(HAS_BIAS)

1507

VECTOR_DECLARATION(bias),

1508

#endif // defined(HAS_BIAS)

1509

int dst_size)

1510

{

1511

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1518

src_stride_w,

1519

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1520

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1528

dst_stride_w,

1529

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1530

dst_offset_first_element_in_bytes,

1531

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1536

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1541

*

1542

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1543

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1544

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1545

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1546

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1547

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1548

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1549

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1550

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1551

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1552

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1553

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1554

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1555

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1556

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1557

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1558

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1559

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1560

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1561

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1562

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1563

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1564

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1565

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1566

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1567

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1568

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1569

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1570

*/

1571

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1572

TENSOR4D_DECLARATION(src),

1573

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1574

#if defined(HAS_BIAS)

1575

VECTOR_DECLARATION(bias),

1576

#endif // defined(HAS_BIAS)

1577

int dst_size)

1578

{

1579

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1586

src_stride_w,

1587

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1588

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1596

dst_stride_w,

1597

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1598

dst_offset_first_element_in_bytes,

1599

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1604

#endif // defined(HAS_BIAS)

1605

dst_size);

1606

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1607

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1608

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1609

1610

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1611

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1612

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1613

*

1614

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1615

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1616

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1617

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1618

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1619

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1620

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1621

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1622

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1623

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1624

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1625

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1626

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1627

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1628

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1629

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1630

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1631

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1632

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1633

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1634

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1635

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1636

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1637

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1638

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1639

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1640

*/

1641

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1642

TENSOR4D_DECLARATION(src),

1643

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1644

#if defined(HAS_BIAS)

1645

,

1646

VECTOR_DECLARATION(bias)

1647

#endif // defined(HAS_BIAS)

1648

)

1649

{

1650

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1657

src_stride_w,

1658

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1659

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1667

dst_stride_w,

1668

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1669

dst_offset_first_element_in_bytes

1670

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1676

#endif // defined(HAS_BIAS)

1677

);

1678

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1679

1680

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

1681

*

1682

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1683

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1684

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1685

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1686

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1687

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1688

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1689

*

1690

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1691

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1692

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1693

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1694

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1695

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1696

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1697

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1698

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1699

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1700

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1701

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1702

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1703

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1704

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1705

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1706

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1707

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1708

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1709

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1710

*/

1711

__kernel void winograd_output_transform_1x2_1x7_nhwc(

1712

TENSOR4D_DECLARATION(src),

1713

TENSOR4D_DECLARATION(dst),

1714

#if defined(HAS_BIAS)

1715

VECTOR_DECLARATION(bias),

1716

#endif // defined(HAS_BIAS)

1717

int dst_size)

1718

{

1719

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1739

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1744

#endif // defined(HAS_BIAS)

1745

dst_size);

1746

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1747

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1748

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1749

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1750

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1751

*

1752

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1753

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1754

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1755

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1756

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1757

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1758

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1759

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1760

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1761

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1762

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1763

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1764

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1765

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1766

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1767

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1768

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1769

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1770

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1771

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1772

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1773

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1774

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1775

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1776

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1777

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1778

*/

1779

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1780

TENSOR4D_DECLARATION(src),

1781

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1782

#if defined(HAS_BIAS)

1783

,

1784

VECTOR_DECLARATION(bias)

1785

#endif // defined(HAS_BIAS)

1786

)

1787

{

1788

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1795

src_stride_w,

1796

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1797

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1805

dst_stride_w,

1806

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1807

dst_offset_first_element_in_bytes

1808

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1814

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1819

*

1820

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1821

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1822

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1823

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1824

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1825

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1826

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1827

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1828

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1829

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1830

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1831

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1832

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1833

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1834

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1835

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1836

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1837

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1838

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1839

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1840

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1841

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1842

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1843

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1844

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1845

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1846

*/

1847

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1848

TENSOR4D_DECLARATION(src),

1849

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1850

#if defined(HAS_BIAS)

1851

,

1852

VECTOR_DECLARATION(bias)

1853

#endif // defined(HAS_BIAS)

1854

)

1855

{

1856

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1863

src_stride_w,

1864

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1865

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1873

dst_stride_w,

1874

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1875

dst_offset_first_element_in_bytes

1876

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1882

#endif // defined(HAS_BIAS)

1883

);

1884

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1885

1886

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

1887

*

1888

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1889

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1890

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1891

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1892

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1893

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1894

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1895

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1896

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1897

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1898

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1899

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1900

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1901

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1902

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1903

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1904

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1905

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1906

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1907

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1908

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1909

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1910

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1911

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1912

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1913

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1914

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1915

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1916

*/

1917

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1918

TENSOR4D_DECLARATION(src),

1919

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1920

#if defined(HAS_BIAS)

1921

VECTOR_DECLARATION(bias),

1922

#endif // defined(HAS_BIAS)

1923

int dst_size)

1924

{

1925

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1932

src_stride_w,

1933

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1934

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1942

dst_stride_w,

1943

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1944

dst_offset_first_element_in_bytes,

1945

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1950

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1955

*

1956

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1957

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1958

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1959

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1960

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1961

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1962

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1963

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1964

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1965

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1966

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1967

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1968

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1969

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1970

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1971

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1972

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1973

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1974

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1975

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1976

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1977

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1978

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1979

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1980

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1981

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1982

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1983

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1984

*/

1985

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1986

TENSOR4D_DECLARATION(src),

1987

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1988

#if defined(HAS_BIAS)

1989

VECTOR_DECLARATION(bias),

1990

#endif // defined(HAS_BIAS)

1991

int dst_size)

1992

{

1993

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2000

src_stride_w,

2001

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2002

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2010

dst_stride_w,

2011

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2012

dst_offset_first_element_in_bytes,

2013

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2018

#endif // defined(HAS_BIAS)

2019

dst_size);

2020

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

2021

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2022

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas