Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

37

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

38

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

39

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

40

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

41

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

43

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

44

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

45

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

46

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

47

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

48

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

49

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

50

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

51

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

52

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

53

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

54

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

55

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

56

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

57

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

58

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

59

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

60

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

*/

63

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

64

TENSOR4D_DECLARATION(src),

65

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

66

#if defined(HAS_BIAS)

67

,

68

VECTOR_DECLARATION(bias)

69

#endif // defined(HAS_BIAS)

70

)

71

{

72

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

73

#if defined(SRC_DEPTH)

74

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

75

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

76

#else /* defined(SRC_DEPTH) */

77

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

78

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

79

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

80

81

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

82

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

83

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

84

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

85

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

88

// Compute the 2x1 or 1x2 output tile

89

// out00 = d00 + d01 + d02

90

// out01 = d01 - d02 - d03

91

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

92

float out00 = d00 + d01 + d02;

93

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

94

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

95

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

96

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

97

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

98

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

99

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

101

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

102

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

103

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

104

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

105

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

106

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

107

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

108

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

109

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

110

111

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

112

float k0 = d01 + d11 + d21;

113

float k1 = d02 + d12 + d22;

114

float k2 = d11 - d21 - d31;

115

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

118

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

119

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

120

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

121

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

126

127

out00 += d00 + d20 + k0 + k1;

128

out01 += k0 - k1 - (d03 + d23);

129

out10 += -d20 - d30 + k2 + k3;

130

out11 += k2 - k3 + d23 + d33;

131

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

132

133

int y_in = get_global_id(1);

134

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

135

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

136

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

137

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

138

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

139

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

141

#if defined(HAS_BIAS)

142

// Add bias

143

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

144

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

145

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

147

out00 += (float)b;

148

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

#endif // defined(HAS_BIAS)

150

151

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

152

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

153

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

154

#else /* defined(SRC_DEPTH) */

155

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

156

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

157

158

// Store the output tile

159

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Michalis Spyrou

ef6ec50

2020-07-31 11:38:36 +0100

[diff] [blame]

160

const VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

161

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

162

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

163

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

164

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

165

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

166

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

167

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

168

169

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

170

#if defined(HAS_BIAS)

171

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

172

out10 += (DATA_TYPE)b;

173

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

174

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

175

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

176

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

177

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

178

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

179

180

/** Compute one two-element column of the intermediate tensor used by the 2x2 / 7x7 output transform.
 *
 * Writes:
 *   col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6
 *   col.s1 = -d1 + d2 - 2*d3 + 2*d4 - 3*d5 + 3*d6 + d7
 *
 * @note Must be used as a statement. Arguments d1..d6 are evaluated twice, so they
 *       must not have side effects.
 *
 * @param[out] col     Lvalue with .s0 and .s1 components that receives the result
 * @param[in]  d0..d7  The eight input values of the column
 */
#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7)                         \
    do                                                                                       \
    {                                                                                        \
        (col).s0 = (d0) + (d1) + (d2) + (d3) + (d4) + (d5) + (d6);                           \
        (col).s1 = -(d1) + (d2) - 2 * (d3) + 2 * (d4) + -3 * (d5) + 3 * (d6) + (d7);         \
    } while(0)

185

186

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

187

*

188

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

189

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

190

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

191

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

192

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

193

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

194

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

195

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

196

*

197

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

198

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

199

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

200

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

201

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

202

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

203

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

204

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

205

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

206

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

207

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

208

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

209

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

210

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

211

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

212

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

213

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

214

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

215

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

216

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

217

*/

218

__kernel void winograd_output_transform_2x2_7x7_nhwc(

219

TENSOR4D_DECLARATION(src),

220

TENSOR4D_DECLARATION(dst),

221

#if defined(HAS_BIAS)

222

VECTOR_DECLARATION(bias),

223

#endif // defined(HAS_BIAS)

224

int dst_size)

225

{

226

// Each thread stores a 4x4/4x1 or 1x4 tile

227

#if defined(SRC_DEPTH)

228

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

229

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

230

#else /* defined(SRC_DEPTH) */

231

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

232

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

233

#endif /* defined(SRC_DEPTH) */

234

235

int y_in = get_global_id(1);

236

int x_out = get_global_id(0);

237

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

238

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

239

#if defined(SRC_DEPTH)

240

int batch = get_global_id(2) / SRC_DEPTH;

241

#endif /* defined(SRC_DEPTH) */

242

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

243

__global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

244

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

245

#if defined(SRC_DEPTH)

246

dst_base_ptr += batch * dst_stride_w;

247

#endif // defined(SRC_DEPTH)

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

248

249

// Load the values across the channels to compose the input tile

250

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

251

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

252

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

253

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

254

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

255

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

256

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

257

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

258

259

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

260

// Compute out00, out01, out02 and out03

261

float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;

262

float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;

263

264

#if defined(HAS_BIAS)

265

// Add bias

266

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

267

268

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

out00 += (float)b;

out01 += (float)b;

#endif // defined(HAS_BIAS)

273

274

// Store the output tile

275

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

276

277

dst_base_ptr += y_out * dst_stride_y;

278

279

int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

280

281

VEC_DATA_TYPE(DATA_TYPE, 2)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

282

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

283

284

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

285

// is overwritten with the valid one

286

*(__global DATA_TYPE *)(dst_base_ptr + offset_z.s1) = out0_dt.s1;

287

*(__global DATA_TYPE *)(dst_base_ptr + offset_z.s0) = out0_dt.s0;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

288

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

289

290

dst_base_ptr += z_out * dst_stride_z;

291

292

int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;

293

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

294

VEC_DATA_TYPE(DATA_TYPE, 2)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

295

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL,

296

B_VAL);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

297

298

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

299

// is overwritten with the valid one

300

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1) = out0_dt.s1;

301

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0) = out0_dt.s0;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

302

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

303

304

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

305

306

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

307

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

308

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

309

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

310

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

311

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

312

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

313

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

314

315

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

316

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

317

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

318

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

319

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

320

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

321

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

322

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

323

324

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

325

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

326

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

327

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

328

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

329

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

330

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

331

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

332

333

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

334

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

335

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

336

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

337

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

338

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

339

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

340

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

341

342

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

343

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

344

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

345

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

346

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

347

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

348

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

349

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

350

351

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

352

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

353

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

354

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

355

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

356

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

357

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

358

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

359

360

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

361

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

362

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

363

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

364

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

365

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

366

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

367

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

368

369

// Compute the 8x2 intermediate tensor

370

VEC_DATA_TYPE(float, 2)

371

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

372

373

COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);

374

COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);

375

COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);

376

COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);

377

COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);

378

COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);

379

COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);

380

COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);

381

382

// Compute the 2x2 output tile

383

VEC_DATA_TYPE(float, 2)

384

out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;

385

VEC_DATA_TYPE(float, 2)

386

out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;

387

388

#if defined(HAS_BIAS)

389

// Add bias

390

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

391

392

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

393

394

out_col0 += (VEC_DATA_TYPE(float, 2))b;

395

out_col1 += (VEC_DATA_TYPE(float, 2))b;

396

397

#endif // defined(HAS_BIAS)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

398

399

int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;

400

int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

401

402

// Store the output tile

403

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

404

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

405

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

406

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

407

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

408

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

409

// is overwritten with the valid one

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

410

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1) = out_col1_dt.s1;

411

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0) = out_col1_dt.s0;

412

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1) = out_col0_dt.s1;

413

*(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0) = out_col0_dt.s0;

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

414

415

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

416

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

417

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

418

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

419

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

420

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

421

*

422

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

423

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

424

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

425

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

426

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

427

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

428

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

429

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

430

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

431

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

432

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

433

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

434

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

435

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

436

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

437

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

438

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

439

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

440

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

441

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

442

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

443

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

444

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

445

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

446

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

447

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

448

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

449

*/

450

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

451

TENSOR4D_DECLARATION(src),

452

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

453

#if defined(HAS_BIAS)

454

,

455

VECTOR_DECLARATION(bias)

456

#endif // defined(HAS_BIAS)

457

)

458

{

459

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

460

#if defined(SRC_DEPTH)

461

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

462

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

463

#else /* defined(SRC_DEPTH) */

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

464

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

465

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

466

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

467

468

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

469

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

470

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

471

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

472

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

473

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

474

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

475

476

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

477

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

478

float out00 = d00 + d01 + d02 + d03 + d04;

479

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

480

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

481

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

482

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

483

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

484

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

485

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

486

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

487

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

488

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

489

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

490

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

491

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

492

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

493

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

494

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

495

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

496

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

497

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

498

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

499

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

500

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

501

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

502

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

503

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

504

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

505

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

506

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

507

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

508

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

509

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

510

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

511

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

512

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

513

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

514

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

515

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

516

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

517

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

518

519

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

520

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

521

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

522

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

523

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

524

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

525

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

526

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

527

528

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

529

out01 += k1 - d02 - d12 - d22 - d32 - d42;

530

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

531

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

532

533

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

534

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

535

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

536

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

537

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

538

539

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

540

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

541

542

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

543

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

544

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

545

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

546

547

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

548

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

549

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

550

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

551

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

552

553

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

554

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

555

556

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

557

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

558

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

559

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

560

561

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

562

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

563

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

564

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

565

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

566

567

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

568

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

569

570

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

571

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

572

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

573

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

574

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

575

576

int y_in = get_global_id(1);

577

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

578

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

579

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

580

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

581

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

582

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

583

584

#if defined(HAS_BIAS)

585

// Add bias

586

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

587

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

588

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

589

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

594

#endif // defined(HAS_BIAS)

595

596

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

597

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

598

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

599

#else /* defined(SRC_DEPTH) */

600

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

601

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

602

603

// Store the output tile

604

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

605

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

606

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

607

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

608

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

609

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

610

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

611

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

612

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

613

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

614

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

615

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

616

617

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

618

#if defined(HAS_BIAS)

619

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

624

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

629

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

634

#endif // defined(HAS_BIAS)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

635

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

636

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

637

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

638

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

639

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

640

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

641

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

642

}

643

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

644

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

645

*

646

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

647

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

648

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

649

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

650

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

651

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

652

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

653

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

654

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

655

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

656

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

657

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

658

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

659

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

660

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

661

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

662

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

663

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

664

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

665

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

666

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

667

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

668

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

669

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

670

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

671

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

672

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

673

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

674

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

675

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

676

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

677

* @param[in] dst_size Size of the destination tensor, minus the last padding

678

*/

679

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

680

TENSOR4D(src, BUFFER),

681

TENSOR4D(dst, BUFFER),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

682

#if defined(HAS_BIAS)

683

VECTOR_DECLARATION(bias),

684

#endif // defined(HAS_BIAS)

685

int dst_size)

686

{

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

687

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

688

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

689

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

690

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

691

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

692

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

693

TILE(DATA_TYPE, 6, N0, in) = { { 0 } };

694

TILE(DATA_TYPE, 4, N0, out) = { { 0 } };

695

TILE(uint, 6, 1, src_indirect_y) = { { 0 } };

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

696

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

697

LOOP_UNROLLING(int, i, 0, 6, 1)

698

{

699

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

700

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);

701

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

702

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

703

// Load the values across the 6 channels to compose the 6x1 or 1x6 tile

704

T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

705

706

// Compute out00, out01, out02 and out03

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

707

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;

708

out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;

709

out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;

710

out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

711

712

#if defined(HAS_BIAS)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

713

TILE(DATA_TYPE, 1, N0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

714

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

715

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

716

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

717

// out = out + bias[broadcasted]

718

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

719

#endif // HAS_BIAS

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

720

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

721

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

722

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

723

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

724

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

725

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

726

TILE(uint, 4, 1, dst_indirect_y) = { { 0 } };

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

727

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

728

// Calculate the destination indirect Y

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

729

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

730

LOOP_UNROLLING(int, yk, 0, 4, 1)

731

{

732

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

733

dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;

734

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

735

}

736

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

737

LOOP_UNROLLING(int, xk, 0, 4, 1)

738

{

739

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

740

dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;

741

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

742

}

743

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

744

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

745

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

746

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

747

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

748

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

749

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

750

// Calculate the indirect Y for the source tensor

751

TILE(DATA_TYPE, 36, N0, in) = { { 0 } };

752

TILE(DATA_TYPE, 4, N0, tmp) = { { 0 } };

753

TILE(uint, 36, 1, src_indirect_y) = { { 0 } };

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

754

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

755

LOOP_UNROLLING(int, i, 0, 36, 1)

756

{

757

src_indirect_y[i].v = mout + i * SRC_HEIGHT;

758

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);

759

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

760

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

761

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

762

T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

763

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

764

LOOP_UNROLLING(int, i, 0, 6, 1)

765

{

766

tmp[0].v = in[6 + i].v + in[12 + i].v;

767

tmp[1].v = in[6 + i].v - in[12 + i].v;

768

tmp[2].v = in[18 + i].v + in[24 + i].v;

769

tmp[3].v = in[18 + i].v - in[24 + i].v;

770

tmp[3].v = tmp[3].v + tmp[3].v;

771

in[i].v = in[i].v + tmp[0].v + tmp[2].v;

772

in[6 + i].v = tmp[3].v + tmp[1].v;

773

in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

774

in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;

775

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

776

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

777

// Compute the output tile

778

TILE(DATA_TYPE, 16, N0, out) = { { 0 } };

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

779

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

780

LOOP_UNROLLING(int, i, 0, 4, 1)

781

{

782

tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;

783

tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;

784

tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;

785

tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;

786

tmp[3].v = tmp[3].v + tmp[3].v;

787

out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;

788

out[4 * i + 1].v = tmp[3].v + tmp[1].v;

789

out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

790

out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;

791

}

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

792

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

793

#if defined(HAS_BIAS)

794

TILE(DATA_TYPE, 1, N0, b);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

795

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

796

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

797

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

798

// out = out + bias[broadcasted]

799

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

800

#endif // HAS_BIAS

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

801

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

802

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

803

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

804

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame^]

805

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

806

807

TILE(uint, 16, 1, dst_indirect_y) = { { 0 } };

808

809

// Calculate the destination indirect Y

810

LOOP_UNROLLING(int, yk, 0, 4, 1)

811

{

812

LOOP_UNROLLING(int, xk, 0, 4, 1)

813

{

814

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

815

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

816

dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;

817

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

}

}

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

822

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

823

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

824

}

825

826

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

827

({ \

828

comm_fact.s0 = d1 + d2; \

829

comm_fact.s1 = d3 + d4; \

830

comm_fact.s2 = d5 + d6; \

831

\

832

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

833

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

834

\

835

comm_fact.s0 = d1 - d2; \

836

comm_fact.s1 = d3 - d4; \

837

comm_fact.s2 = d5 - d6; \

838

\

839

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

840

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

841

})

842

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

843

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

844

*

845

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

846

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

847

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

848

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

849

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

850

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

851

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

852

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

853

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

854

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

855

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

856

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

857

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

858

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

859

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

860

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

861

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

862

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

863

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

864

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

865

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

866

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

867

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

868

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

869

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

870

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

871

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

872

*/

873

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

874

TENSOR4D_DECLARATION(src),

875

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

876

#if defined(HAS_BIAS)

877

,

878

VECTOR_DECLARATION(bias)

879

#endif // defined(HAS_BIAS)

880

)

881

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

882

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

883

#if defined(SRC_DEPTH)

884

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

885

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

886

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

887

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

888

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

889

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

890

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

891

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

892

// Compute output address

893

int y_in = get_global_id(1);

894

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

895

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

896

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

897

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

898

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

899

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

900

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

901

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

902

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

903

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

904

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

905

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

906

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

907

908

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

909

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

910

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

911

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

912

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

913

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

914

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

915

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

916

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

917

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

918

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

919

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

920

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

921

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

922

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

923

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

924

925

#if defined(HAS_BIAS)

926

// Add bias

927

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

928

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

929

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

930

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

931

out00 += (DATA_TYPE)b;

932

out01 += (DATA_TYPE)b;

933

out02 += (DATA_TYPE)b;

934

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

935

#endif // defined(HAS_BIAS)

936

937

// Store the output tile

938

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

939

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

940

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

941

B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

942

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

943

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

944

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

945

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

946

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

947

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

948

(__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

949

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

950

951

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

952

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

953

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

954

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

955

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

956

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

957

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

958

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

959

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

960

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

961

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

962

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

963

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

964

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

965

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

966

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

967

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

968

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

969

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

970

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

971

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

972

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

973

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

974

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

975

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

976

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

977

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

978

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

979

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

980

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

981

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

982

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

983

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

984

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

985

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

986

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

987

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

988

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

989

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

990

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

991

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

992

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

993

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

994

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

995

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

996

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

997

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

998

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

999

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1000

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1001

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1002

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1003

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1004

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1005

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1006

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1007

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1008

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1009

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1010

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1011

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1012

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1013

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1014

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1015

1016

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1017

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1018

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1019

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1020

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1021

1022

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1023

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1024

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1025

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1026

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1027

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1028

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1029

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1030

1031

// Compute the 4x4 output tile

1032

comm_fact0 = tmp_col1 + tmp_col2;

1033

comm_fact1 = tmp_col3 + tmp_col4;

1034

comm_fact2 = tmp_col5 + tmp_col6;

1035

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1036

VEC_DATA_TYPE(float, 4)

1037

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

1038

VEC_DATA_TYPE(float, 4)

1039

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1040

1041

comm_fact0 = tmp_col1 - tmp_col2;

1042

comm_fact1 = tmp_col3 - tmp_col4;

1043

comm_fact2 = tmp_col5 - tmp_col6;

1044

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1045

VEC_DATA_TYPE(float, 4)

1046

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

1047

VEC_DATA_TYPE(float, 4)

1048

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1049

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1050

#if defined(HAS_BIAS)

1051

// Add bias

1052

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1053

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1054

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1055

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1056

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1057

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1058

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1059

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1060

#endif // defined(HAS_BIAS)

1061

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1062

// Store the output tile

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1063

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1064

(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1065

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1066

(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1067

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1068

(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1069

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1070

(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1071

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1072

}

1073

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1074

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size is 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1075

*

1076

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1077

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1078

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1079

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1080

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1081

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1082

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1083

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1084

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1085

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1086

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1087

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1088

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1089

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1090

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1091

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1092

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1093

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1094

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1095

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1096

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1097

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1098

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1099

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1100

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1101

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1102

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1103

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1104

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1105

*/

1106

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1107

TENSOR4D_DECLARATION(src),

1108

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1109

#if defined(HAS_BIAS)

1110

VECTOR_DECLARATION(bias),

1111

#endif // defined(HAS_BIAS)

1112

int dst_size)

1113

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1114

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1115

#if defined(SRC_DEPTH)

1116

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1117

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1118

#else /* defined(SRC_DEPTH) */

1119

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1120

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

1121

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1122

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1123

int y_in = get_global_id(1);

1124

int x_out = get_global_id(0);

1125

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

1126

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1127

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1128

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1129

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1130

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1131

__global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);

1132

1133

#if defined(SRC_DEPTH)

1134

dst_base_ptr += batch * dst_stride_w;

1135

#endif // defined(SRC_DEPTH)

1136

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1137

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1138

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

1139

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

1140

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

1141

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

1142

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

1143

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

1144

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

1145

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1146

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1147

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1148

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1149

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

1150

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

1151

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

1152

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1153

1154

#if defined(HAS_BIAS)

1155

// Add bias

1156

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1157

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1158

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1159

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1164

#endif // defined(HAS_BIAS)

1165

1166

// Store the output tile

1167

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1168

1169

dst_base_ptr += y_out * dst_stride_y;

1170

1171

int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1172

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1173

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1174

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

1175

B_VAL);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1176

1177

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

1178

// is overwritten with the valid one

1179

*((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3;

1180

*((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2;

1181

*((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1;

1182

*((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1183

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1184

1185

dst_base_ptr += z_out * dst_stride_z;

1186

1187

int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;

1188

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1189

VEC_DATA_TYPE(DATA_TYPE, 4)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1190

out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,

Usama Arif

2019-05-10 17:07:27 +0100

[diff] [blame]

1191

B_VAL);

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1192

1193

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

1194

// is overwritten with the valid one

1195

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3;

1196

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2;

1197

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1;

1198

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1199

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1200

1201

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1202

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1203

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1204

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1205

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1206

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1207

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1208

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1209

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1210

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1211

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1212

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1213

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1214

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1215

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1216

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1217

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1218

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1219

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1220

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1221

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1222

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1223

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1224

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1225

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1226

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1227

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1228

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1229

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1230

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1231

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1232

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1233

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1234

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1235

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1236

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1237

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1238

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1239

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1240

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1241

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1242

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1243

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1244

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1245

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1246

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1247

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1248

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1249

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1250

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1251

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1252

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1253

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1254

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1255

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1256

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1257

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1258

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1259

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1260

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1261

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1262

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1263

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1264

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1265

1266

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1267

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1268

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1269

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1270

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1271

1272

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1273

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1274

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1275

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1276

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1277

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1278

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1279

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1280

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1281

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1282

comm_fact0 = tmp_col1 + tmp_col2;

1283

comm_fact1 = tmp_col3 + tmp_col4;

1284

comm_fact2 = tmp_col5 + tmp_col6;

1285

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1286

VEC_DATA_TYPE(float, 4)

1287

out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1288

VEC_DATA_TYPE(float, 4)

1289

out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1290

1291

comm_fact0 = tmp_col1 - tmp_col2;

1292

comm_fact1 = tmp_col3 - tmp_col4;

1293

comm_fact2 = tmp_col5 - tmp_col6;

1294

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1295

VEC_DATA_TYPE(float, 4)

1296

out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1297

VEC_DATA_TYPE(float, 4)

1298

out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1299

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1300

#if defined(HAS_BIAS)

1301

// Add bias

1302

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1303

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1304

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1305

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1306

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1307

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1308

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1309

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1310

#endif // defined(HAS_BIAS)

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1311

1312

int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;

1313

int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1314

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1315

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1316

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1317

out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1318

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1319

out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1320

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1321

out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1322

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1323

out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1324

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1325

// To avoid the out-of-bound write, we store the elements in reverse order so the invalid element

1326

// is overwritten with the valid one

1327

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out_col3_dt.s3;

1328

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out_col2_dt.s3;

1329

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out_col1_dt.s3;

1330

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out_col0_dt.s3;

1331

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out_col3_dt.s2;

1332

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out_col2_dt.s2;

1333

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out_col1_dt.s2;

1334

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out_col0_dt.s2;

1335

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out_col3_dt.s1;

1336

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out_col2_dt.s1;

1337

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out_col1_dt.s1;

1338

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out_col0_dt.s1;

1339

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out_col3_dt.s0;

1340

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out_col2_dt.s0;

1341

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out_col1_dt.s0;

1342

*((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out_col0_dt.s0;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1343

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1344

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1345

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1346

1347

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1348

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1349

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1350

*

1351

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1352

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1353

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1354

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1355

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1356

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1357

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1358

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1359

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1360

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1361

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1362

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1363

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1364

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1365

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1366

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1367

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1368

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1369

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1370

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1371

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1372

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1373

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1374

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1375

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1376

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1377

*/

1378

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1379

TENSOR4D_DECLARATION(src),

1380

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1381

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// the separating comma lives inside the #if so the list stays valid without it.
#if defined(HAS_BIAS)

1382

,

1383

VECTOR_DECLARATION(bias)

1384

#endif // defined(HAS_BIAS)

1385

)

1386

{

1387

// Thin wrapper: forwards every src/dst tensor argument unchanged (plus the bias
// vector arguments when HAS_BIAS is defined) to winograd_output_transform_2x2_3x3_nchw.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1394

src_stride_w,

1395

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1396

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1404

dst_stride_w,

1405

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1406

dst_offset_first_element_in_bytes

1407

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1413

#endif // defined(HAS_BIAS)

1414

);

1415

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1416

1417

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

1418

*

1419

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1420

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1421

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1422

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1423

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1424

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1425

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1426

*

1427

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1428

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1429

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1430

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1431

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1432

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1433

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1434

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1435

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1436

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1437

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1438

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1439

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1440

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1441

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1442

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1443

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1444

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1445

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1446

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1447

*/

1448

__kernel void winograd_output_transform_2x1_7x1_nhwc(

1449

TENSOR4D_DECLARATION(src),

1450

TENSOR4D_DECLARATION(dst),

1451

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// dst_size is always the trailing parameter, so the bias declaration carries its own comma.
#if defined(HAS_BIAS)

1452

VECTOR_DECLARATION(bias),

1453

#endif // defined(HAS_BIAS)

1454

int dst_size)

1455

{

1456

// Thin wrapper: forwards every src/dst tensor argument, the bias vector arguments
// (when HAS_BIAS is defined) and dst_size unchanged to winograd_output_transform_2x2_7x7_nhwc.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1476

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1481

#endif // defined(HAS_BIAS)

1482

dst_size);

1483

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1484

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1485

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1486

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1487

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1488

*

1489

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1490

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1491

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1492

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1493

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1494

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1495

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1496

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1497

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1498

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1499

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1500

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1501

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1502

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1503

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1504

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1505

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1506

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1507

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1508

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1509

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1510

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1511

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1512

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1513

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1514

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1515

*/

1516

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1517

TENSOR4D_DECLARATION(src),

1518

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1519

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// the separating comma lives inside the #if so the list stays valid without it.
#if defined(HAS_BIAS)

1520

,

1521

VECTOR_DECLARATION(bias)

1522

#endif // defined(HAS_BIAS)

1523

)

1524

{

1525

// Thin wrapper: forwards every src/dst tensor argument unchanged (plus the bias
// vector arguments when HAS_BIAS is defined) to winograd_output_transform_4x4_3x3_nchw.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1532

src_stride_w,

1533

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1534

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1542

dst_stride_w,

1543

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1544

dst_offset_first_element_in_bytes

1545

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1551

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1556

*

1557

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1558

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1559

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1560

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1561

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1562

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1563

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1564

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1565

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1566

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1567

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1568

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1569

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1570

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1571

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1572

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1573

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1574

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1575

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1576

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1577

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1578

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1579

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1580

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1581

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1582

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1583

*/

1584

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1585

TENSOR4D_DECLARATION(src),

1586

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1587

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// the separating comma lives inside the #if so the list stays valid without it.
#if defined(HAS_BIAS)

1588

,

1589

VECTOR_DECLARATION(bias)

1590

#endif // defined(HAS_BIAS)

1591

)

1592

{

1593

// Thin wrapper: forwards every src/dst tensor argument unchanged (plus the bias
// vector arguments when HAS_BIAS is defined) to winograd_output_transform_4x4_5x5_nchw.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1600

src_stride_w,

1601

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1602

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1610

dst_stride_w,

1611

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1612

dst_offset_first_element_in_bytes

1613

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1619

#endif // defined(HAS_BIAS)

1620

);

1621

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1622

1623

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1624

*

1625

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1626

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1627

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1628

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1629

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1630

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1631

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1632

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1633

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1634

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1635

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1636

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1637

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1638

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1639

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1640

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1641

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1642

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1643

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1644

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1645

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1646

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1647

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1648

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1649

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1650

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1651

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1652

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1653

*/

1654

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1655

TENSOR4D_DECLARATION(src),

1656

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1657

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// dst_size is always the trailing parameter, so the bias declaration carries its own comma.
#if defined(HAS_BIAS)

1658

VECTOR_DECLARATION(bias),

1659

#endif // defined(HAS_BIAS)

1660

int dst_size)

1661

{

1662

// Thin wrapper: forwards every src/dst tensor argument, the bias vector arguments
// (when HAS_BIAS is defined) and dst_size unchanged to winograd_output_transform_4x4_3x3_nhwc.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1669

src_stride_w,

1670

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1671

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1679

dst_stride_w,

1680

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1681

dst_offset_first_element_in_bytes,

1682

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1687

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1692

*

1693

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1694

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1695

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1696

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1697

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1698

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1699

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1700

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1701

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1702

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1703

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1704

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1705

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1706

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1707

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1708

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1709

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1710

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1711

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1712

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1713

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1714

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1715

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1716

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1717

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1718

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1719

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1720

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1721

*/

1722

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1723

TENSOR4D_DECLARATION(src),

1724

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1725

// The bias vector parameter is only part of the signature when HAS_BIAS is defined;
// dst_size is always the trailing parameter, so the bias declaration carries its own comma.
#if defined(HAS_BIAS)

1726

VECTOR_DECLARATION(bias),

1727

#endif // defined(HAS_BIAS)

1728

int dst_size)

1729

{

1730

// Thin wrapper: forwards every src/dst tensor argument, the bias vector arguments
// (when HAS_BIAS is defined) and dst_size unchanged to winograd_output_transform_4x4_5x5_nhwc.
// NOTE(review): the callee is defined outside this view; presumably it selects its
// 1D path from WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL - confirm.
winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1737

src_stride_w,

1738

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1739

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1747

dst_stride_w,

1748

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1749

dst_offset_first_element_in_bytes,

1750

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1755

#endif // defined(HAS_BIAS)

1756

dst_size);

1757

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1758

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1759

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1760

1761

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1762

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1763

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1764

*

1765

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1766

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1767

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1768

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1769

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1770

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1771

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1772

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1773

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1774

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1775

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1776

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1777

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1778

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1779

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1780

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1781

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1782

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1783

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1784

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1785

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1786

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1787

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1788

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1789

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1790

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1791

*/

1792

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1793

TENSOR4D_DECLARATION(src),

1794

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1795

// The separating comma lives inside the #if so the parameter list stays
// well-formed when the kernel is built without -DHAS_BIAS.
#if defined(HAS_BIAS)

1796

,

1797

VECTOR_DECLARATION(bias)

1798

#endif // defined(HAS_BIAS)

1799

)

1800

{

1801

// Thin wrapper: the 1x2 / 1x3 NCHW case is delegated to the 2x2 / 3x3 NCHW routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1808

src_stride_w,

1809

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1810

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1818

dst_stride_w,

1819

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1820

dst_offset_first_element_in_bytes

1821

// Bias arguments (with their leading comma) are forwarded only when HAS_BIAS is defined.
#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1827

#endif // defined(HAS_BIAS)

1828

);

1829

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1830

1831

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

1832

*

1833

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1834

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1835

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

1836

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1837

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1838

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1839

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1840

*

1841

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1842

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1843

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1844

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1845

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1846

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1847

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1848

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1849

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1850

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1851

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1852

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1853

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1854

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1855

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1856

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1857

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1858

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1859

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1860

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1861

*/

1862

__kernel void winograd_output_transform_1x2_1x7_nhwc(

1863

TENSOR4D_DECLARATION(src),

1864

TENSOR4D_DECLARATION(dst),

1865

// The bias vector parameter exists only when the kernel is built with -DHAS_BIAS.
#if defined(HAS_BIAS)

1866

VECTOR_DECLARATION(bias),

1867

#endif // defined(HAS_BIAS)

1868

int dst_size)

1869

{

1870

// Thin wrapper: the 1x2 / 1x7 NHWC case is delegated to the 2x2 / 7x7 NHWC routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1890

// Bias arguments are forwarded only when HAS_BIAS is defined, mirroring the parameter list.
#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1895

#endif // defined(HAS_BIAS)

1896

dst_size);

1897

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1898

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1899

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1900

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1901

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1902

*

1903

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1904

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1905

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1906

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1907

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1908

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1909

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1910

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1911

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1912

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1913

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1914

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1915

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1916

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1917

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1918

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1919

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1920

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1921

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1922

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1923

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1924

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1925

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1926

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1927

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1928

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1929

*/

1930

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1931

TENSOR4D_DECLARATION(src),

1932

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1933

// The separating comma lives inside the #if so the parameter list stays
// well-formed when the kernel is built without -DHAS_BIAS.
#if defined(HAS_BIAS)

1934

,

1935

VECTOR_DECLARATION(bias)

1936

#endif // defined(HAS_BIAS)

1937

)

1938

{

1939

// Thin wrapper: the 1x4 / 1x3 NCHW case is delegated to the 4x4 / 3x3 NCHW routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1946

src_stride_w,

1947

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1948

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1956

dst_stride_w,

1957

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1958

dst_offset_first_element_in_bytes

1959

// Bias arguments (with their leading comma) are forwarded only when HAS_BIAS is defined.
#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1965

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1970

*

1971

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1972

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1973

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1974

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1975

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1976

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1977

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1978

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1979

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1980

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1981

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1982

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1983

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1984

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1985

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1986

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1987

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1988

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1989

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1990

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1991

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1992

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1993

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1994

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1995

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1996

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1997

*/

1998

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1999

TENSOR4D_DECLARATION(src),

2000

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2001

// The separating comma lives inside the #if so the parameter list stays
// well-formed when the kernel is built without -DHAS_BIAS.
#if defined(HAS_BIAS)

2002

,

2003

VECTOR_DECLARATION(bias)

2004

#endif // defined(HAS_BIAS)

2005

)

2006

{

2007

// Thin wrapper: the 1x4 / 1x5 NCHW case is delegated to the 4x4 / 5x5 NCHW routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2014

src_stride_w,

2015

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2016

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2024

dst_stride_w,

2025

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2026

dst_offset_first_element_in_bytes

2027

// Bias arguments (with their leading comma) are forwarded only when HAS_BIAS is defined.
#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2033

#endif // defined(HAS_BIAS)

2034

);

2035

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2036

2037

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

2038

*

2039

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2040

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2041

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

2042

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

2043

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2044

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2045

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2046

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2047

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2048

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2049

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2050

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2051

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2052

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2053

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2054

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2055

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2056

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2057

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2058

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2059

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2060

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2061

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2062

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2063

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2064

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

2065

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2066

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2067

*/

2068

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2069

TENSOR4D_DECLARATION(src),

2070

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2071

// The bias vector parameter exists only when the kernel is built with -DHAS_BIAS.
#if defined(HAS_BIAS)

2072

VECTOR_DECLARATION(bias),

2073

#endif // defined(HAS_BIAS)

2074

int dst_size)

2075

{

2076

// Thin wrapper: the 1x4 / 1x3 NHWC case is delegated to the 4x4 / 3x3 NHWC routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2083

src_stride_w,

2084

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2085

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2093

dst_stride_w,

2094

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2095

dst_offset_first_element_in_bytes,

2096

// Bias arguments are forwarded only when HAS_BIAS is defined, mirroring the parameter list.
#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2101

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

2106

*

2107

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2108

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2109

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2020-10-20 09:14:45 +0100

[diff] [blame]

2110

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

2111

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2112

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2113

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2114

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2115

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2116

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2117

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2118

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2119

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2120

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2121

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2122

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2123

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2124

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2125

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2126

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2127

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2128

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2129

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2130

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2131

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2132

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

2133

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2134

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2135

*/

2136

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2137

TENSOR4D_DECLARATION(src),

2138

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2139

// The bias vector parameter exists only when the kernel is built with -DHAS_BIAS.
#if defined(HAS_BIAS)

2140

VECTOR_DECLARATION(bias),

2141

#endif // defined(HAS_BIAS)

2142

int dst_size)

2143

{

2144

// Thin wrapper: the 1x4 / 1x5 NHWC case is delegated to the 4x4 / 5x5 NHWC routine,
// forwarding every argument unchanged and in the same order.
// NOTE(review): the callee presumably specialises on WINOGRAD_OUTPUT_TRANSFORM_VERTICAL
// to handle the 1D tile; it is defined outside this chunk, so confirm there.
winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2151

src_stride_w,

2152

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2153

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2161

dst_stride_w,

2162

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2163

dst_offset_first_element_in_bytes,

2164

// Bias arguments are forwarded only when HAS_BIAS is defined, mirroring the parameter list.
#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2169

#endif // defined(HAS_BIAS)

2170

dst_size);

2171

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

2172

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2173

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas