Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-08-31 16:10:16 +0100

[diff] [blame]

42

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

43

* @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu

44

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.

45

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)

46

* @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=int

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

47

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

48

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

49

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

50

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

51

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

52

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

53

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

54

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

55

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

56

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

57

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

58

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

59

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

60

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

61

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

62

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

63

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

64

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

65

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

66

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

67

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

68

*/

69

__kernel void winograd_output_transform_2x2_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

70

TENSOR4D_DECLARATION(src),

71

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

72

#if defined(HAS_BIAS)

73

,

74

VECTOR_DECLARATION(bias)

75

#endif // defined(HAS_BIAS)

76

)

77

{

78

// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

79

#if defined(SRC_DEPTH)

80

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

81

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

82

#else /* defined(SRC_DEPTH) */

83

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

84

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

85

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

86

87

// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

88

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

89

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

90

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

91

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

92

93

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

94

// Compute the 2x1 or 1x2 output tile

95

// out00 = d00 + d01 + d02

96

// out01 = d01 - d02 - d03

97

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

98

float out00 = d00 + d01 + d02;

99

float out01 = d01 - d02 - d03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

100

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

101

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

102

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

103

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

104

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

105

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

106

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

107

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

108

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

109

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

110

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

111

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

112

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

113

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

114

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

115

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

117

// Compute the 2x2 output tile

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

118

float k0 = d01 + d11 + d21;

119

float k1 = d02 + d12 + d22;

120

float k2 = d11 - d21 - d31;

121

float k3 = d12 - d22 - d32;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

122

123

// out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22

124

// out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)

125

// out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)

126

// out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)

127

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

float out00 = d10;

float out01 = -d13;

float out10 = d10;

float out11 = -d13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

132

133

out00 += d00 + d20 + k0 + k1;

134

out01 += k0 - k1 - (d03 + d23);

135

out10 += -d20 - d30 + k2 + k3;

136

out11 += k2 - k3 + d23 + d33;

137

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

138

139

int y_in = get_global_id(1);

140

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

141

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

142

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

143

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

144

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

145

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

146

147

#if defined(HAS_BIAS)

148

// Add bias

149

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

150

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

151

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

152

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

153

out00 += (float)b;

154

out01 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

155

#endif // defined(HAS_BIAS)

156

157

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

158

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

159

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

160

#else /* defined(SRC_DEPTH) */

161

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

162

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

163

164

// Store the output tile

165

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

166

const const VEC_DATA_TYPE(DATA_TYPE, 2)

167

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));

168

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

169

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

170

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

171

vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

172

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

173

174

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

175

#if defined(HAS_BIAS)

176

// Add bias

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

177

out10 += (DATA_TYPE)b;

178

out11 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

179

#endif // defined(HAS_BIAS)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

180

vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

181

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

182

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

183

184

#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \

185

({ \

186

col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \

187

col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \

188

})

189

190

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

191

*

192

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

193

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

194

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

195

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

196

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

197

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

198

*

199

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

200

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

201

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

202

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

203

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

204

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

205

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

206

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

207

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

208

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

209

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

210

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

211

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

212

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

213

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

214

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

215

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

216

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

217

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

218

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

219

*/

220

__kernel void winograd_output_transform_2x2_7x7_nhwc(

221

TENSOR4D_DECLARATION(src),

222

TENSOR4D_DECLARATION(dst),

223

#if defined(HAS_BIAS)

224

VECTOR_DECLARATION(bias),

225

#endif // defined(HAS_BIAS)

226

int dst_size)

227

{

228

// Each thread stores a 4x4/4x1 or 1x4 tile

229

#if defined(SRC_DEPTH)

230

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

231

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

232

#else /* defined(SRC_DEPTH) */

233

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

234

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

235

#endif /* defined(SRC_DEPTH) */

236

237

int y_in = get_global_id(1);

238

int x_out = get_global_id(0);

239

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

240

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

241

#if defined(SRC_DEPTH)

242

int batch = get_global_id(2) / SRC_DEPTH;

243

#endif /* defined(SRC_DEPTH) */

244

245

#if defined(SRC_DEPTH)

246

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

247

#else /* defined(SRC_DEPTH) */

248

249

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

250

#endif /* defined(SRC_DEPTH) */

251

252

// Load the values across the channels to compose the input tile

253

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

254

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

255

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

256

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

257

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

258

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

259

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

260

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

261

262

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

263

// Compute out00, out01, out02 and out03

264

float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;

265

float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;

266

267

#if defined(HAS_BIAS)

268

// Add bias

269

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

270

271

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

out00 += (float)b;

out01 += (float)b;

#endif // defined(HAS_BIAS)

276

277

// Store the output tile

278

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

279

// Get output address

280

#if defined(SRC_DEPTH)

281

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

282

#else /* defined(SRC_DEPTH) */

283

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

284

#endif /* defined(SRC_DEPTH) */

285

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

286

287

VEC_DATA_TYPE(DATA_TYPE, 2)

288

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));

289

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

290

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

291

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

292

// Get output address

293

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

294

VEC_DATA_TYPE(DATA_TYPE, 2)

295

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));

296

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

297

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

298

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

299

300

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

301

302

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

303

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

304

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

305

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

306

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

307

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

308

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

309

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

310

311

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

312

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

313

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

314

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

315

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

316

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

317

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

318

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

319

320

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

321

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

322

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

323

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

324

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

325

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

326

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

327

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

328

329

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

330

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

331

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

332

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

333

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

334

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

335

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

336

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

337

338

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

339

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

340

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

341

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

342

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

343

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

344

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

345

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

346

347

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

348

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

349

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

350

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

351

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

352

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

353

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

354

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

355

356

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

357

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

358

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

359

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

360

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

361

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

362

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

363

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

364

365

// Compute the 8x2 intermediate tensor

366

VEC_DATA_TYPE(float, 2)

367

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

368

369

COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);

370

COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);

371

COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);

372

COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);

373

COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);

374

COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);

375

COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);

376

COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);

377

378

// Compute the 2x2 output tile

379

VEC_DATA_TYPE(float, 2)

380

out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;

381

VEC_DATA_TYPE(float, 2)

382

out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;

383

384

#if defined(HAS_BIAS)

385

// Add bias

386

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

387

388

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

389

390

out_col0 += (VEC_DATA_TYPE(float, 2))b;

391

out_col1 += (VEC_DATA_TYPE(float, 2))b;

392

393

#endif // defined(HAS_BIAS)

394

// Get output address

395

#if defined(SRC_DEPTH)

396

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

397

#else /* defined(SRC_DEPTH) */

398

int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

399

#endif /* defined(SRC_DEPTH) */

400

offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

401

int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

402

403

// Store the output tile

404

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

405

out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

406

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

407

out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

408

409

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

410

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

411

412

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

413

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

414

415

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

416

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

417

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

418

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

419

#if defined(VEC_SIZE) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

420

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW

421

*

422

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

423

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

424

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

425

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

426

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

427

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

428

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

429

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

430

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

431

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

432

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

433

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

434

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

435

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

436

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

437

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

438

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

439

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

440

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

441

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

442

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

443

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

444

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

445

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

446

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

447

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

448

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

449

*/

450

__kernel void winograd_output_transform_4x4_3x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

451

TENSOR4D_DECLARATION(src),

452

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

453

#if defined(HAS_BIAS)

454

,

455

VECTOR_DECLARATION(bias)

456

#endif // defined(HAS_BIAS)

457

)

458

{

459

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

460

#if defined(SRC_DEPTH)

461

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

462

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

463

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

464

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

465

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

466

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

467

468

// Load the values across the channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

469

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

470

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

471

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

472

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

473

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

474

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

475

476

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

477

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

478

float out00 = d00 + d01 + d02 + d03 + d04;

479

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

480

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

481

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

482

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

483

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

484

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

485

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

486

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

487

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

488

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

489

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

490

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

491

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

492

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

493

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

494

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

495

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

496

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

497

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

498

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

499

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

500

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

501

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

502

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

503

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

504

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

505

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

506

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

507

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

508

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

509

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

510

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

511

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

512

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

513

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

514

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

515

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

516

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

517

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

518

519

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

520

float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

521

float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

522

float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;

523

float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

524

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

525

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

526

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

527

528

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

529

out01 += k1 - d02 - d12 - d22 - d32 - d42;

530

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

531

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

532

533

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

534

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

535

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

536

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

537

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

538

539

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

540

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

541

542

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

543

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

544

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

545

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

546

547

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

548

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

549

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

550

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

551

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

552

553

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

554

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

555

556

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

557

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

558

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

559

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

560

561

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

562

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

563

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

564

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

565

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

566

567

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

568

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

569

570

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

571

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

572

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

573

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

574

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

575

576

int y_in = get_global_id(1);

577

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

578

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

579

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

580

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

581

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

582

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

583

584

#if defined(HAS_BIAS)

585

// Add bias

586

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

587

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

588

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

589

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

594

#endif // defined(HAS_BIAS)

595

596

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

597

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

598

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

599

#else /* defined(SRC_DEPTH) */

600

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

601

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

602

603

// Store the output tile

604

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

605

VEC_DATA_TYPE(DATA_TYPE, 4)

606

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

607

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

608

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

609

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

610

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

611

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

612

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

613

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

614

615

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

616

#if defined(HAS_BIAS)

617

// Add bias

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

622

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

627

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

632

#endif // defined(HAS_BIAS)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

633

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

634

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

635

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

636

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

637

}

638

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

639

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

640

*

641

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

642

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

643

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

644

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

645

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

646

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

647

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

648

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

649

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

650

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

651

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

652

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

653

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

654

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

655

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

656

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

657

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

658

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

659

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

660

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

662

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

663

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

664

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

665

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

666

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

667

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

668

* @param[in] dst_size Size of the destination tensor, minus the last padding

669

*/

670

__kernel void winograd_output_transform_4x4_3x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

671

TENSOR4D_DECLARATION(src),

672

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

673

#if defined(HAS_BIAS)

674

VECTOR_DECLARATION(bias),

675

#endif // defined(HAS_BIAS)

676

int dst_size)

677

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

678

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

679

#if defined(SRC_DEPTH)

680

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

681

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

682

#else /* defined(SRC_DEPTH) */

683

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

684

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

685

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

686

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

687

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

688

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

689

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

690

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

691

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

692

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

693

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

694

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

695

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

696

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

697

float out00 = d00 + d01 + d02 + d03 + d04;

698

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

699

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

700

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

701

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

702

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

703

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

704

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

705

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

706

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

707

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

708

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

709

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

710

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

711

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

712

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

713

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

714

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

715

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

716

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

717

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

718

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

719

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

720

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

721

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

722

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

723

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

724

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

725

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

726

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

727

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

728

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

729

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

730

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

731

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

732

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

733

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

734

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

735

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

736

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

737

738

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

739

float out00 = d01 + d21 + d41 + d11 + d31;

740

float out01 = d01 + d21 + d41 + d11 + d31;

741

float out02 = d01 + d21 + d41 + d11 + d31;

742

float out03 = d01 + d21 + d41 + d11 + d31;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

743

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

744

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

745

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

746

747

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

748

out01 += k1 - d02 - d12 - d22 - d32 - d42;

749

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

750

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

751

752

// Compute out10, out11, out12 and out13

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

753

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

754

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

755

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

756

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

757

758

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

759

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

760

761

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

762

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

763

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

764

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

765

766

// Compute out20, out21, out22 and out23

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

767

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

768

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

769

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

770

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

771

772

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

773

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

774

775

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

776

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

777

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

778

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

779

780

// Compute out30, out31, out32 and out33

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

781

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

782

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

783

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

784

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

785

786

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

787

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

788

789

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

790

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

791

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

792

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

793

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

794

795

int y_in = get_global_id(1);

796

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

797

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

798

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

799

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

800

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

801

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

802

803

#if defined(HAS_BIAS)

804

// Add bias

805

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

806

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

807

DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

808

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

809

out00 += (DATA_TYPE)b;

810

out01 += (DATA_TYPE)b;

811

out02 += (DATA_TYPE)b;

812

out03 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

813

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

814

out10 += (DATA_TYPE)b;

815

out11 += (DATA_TYPE)b;

816

out12 += (DATA_TYPE)b;

817

out13 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

818

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

819

out20 += (DATA_TYPE)b;

820

out21 += (DATA_TYPE)b;

821

out22 += (DATA_TYPE)b;

822

out23 += (DATA_TYPE)b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

823

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

824

out30 += (DATA_TYPE)b;

825

out31 += (DATA_TYPE)b;

826

out32 += (DATA_TYPE)b;

827

out33 += (DATA_TYPE)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

828

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

829

830

#endif // defined(HAS_BIAS)

831

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

832

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

833

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

834

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

835

#else /* defined(SRC_DEPTH) */

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

836

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

837

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

838

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

839

840

// Store the 1x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

841

VEC_DATA_TYPE(DATA_TYPE, 4)

842

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

843

*((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;

844

*((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;

845

*((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;

846

*((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

847

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

848

// Store the 4x1 output tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

849

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Giorgio Arena

d02eb45

2018-07-18 11:45:30 +0100

[diff] [blame]

850

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

851

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

852

VEC_DATA_TYPE(DATA_TYPE, 4)

853

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

854

*((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;

855

*((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;

856

*((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;

857

*((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

858

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

859

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

860

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

861

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

862

#else /* defined(SRC_DEPTH) */

863

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

864

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

865

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

866

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

867

868

// Store the 4x4 output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

869

VEC_DATA_TYPE(DATA_TYPE, 4)

870

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

871

VEC_DATA_TYPE(DATA_TYPE, 4)

872

out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));

873

VEC_DATA_TYPE(DATA_TYPE, 4)

874

out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));

875

VEC_DATA_TYPE(DATA_TYPE, 4)

876

out3_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));

877

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;

878

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;

879

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;

880

*((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;

881

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;

882

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;

883

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;

884

*((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;

885

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;

886

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;

887

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;

888

*((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;

889

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;

890

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;

891

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;

892

*((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

893

894

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

895

}

896

897

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

898

({ \

899

comm_fact.s0 = d1 + d2; \

900

comm_fact.s1 = d3 + d4; \

901

comm_fact.s2 = d5 + d6; \

902

\

903

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

904

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

905

\

906

comm_fact.s0 = d1 - d2; \

907

comm_fact.s1 = d3 - d4; \

908

comm_fact.s2 = d5 - d6; \

909

\

910

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

911

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

912

})

913

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

914

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

915

*

916

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

917

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

918

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

919

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

920

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

921

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

922

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

923

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

924

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

925

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

926

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

927

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

928

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

929

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

930

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

931

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

932

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

933

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

934

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

935

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

936

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

937

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

938

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

939

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

940

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

941

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

942

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

943

*/

944

__kernel void winograd_output_transform_4x4_5x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

945

TENSOR4D_DECLARATION(src),

946

TENSOR4D_DECLARATION(dst)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

947

#if defined(HAS_BIAS)

948

,

949

VECTOR_DECLARATION(bias)

950

#endif // defined(HAS_BIAS)

951

)

952

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

953

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

954

#if defined(SRC_DEPTH)

955

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

956

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

957

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

958

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

959

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

960

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

961

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

962

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

963

// Compute output address

964

int y_in = get_global_id(1);

965

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

966

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

967

int z_out = get_global_id(0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

968

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

969

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

970

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

971

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

972

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

973

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

974

#else /* defined(SRC_DEPTH) */

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

975

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

976

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

977

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

978

979

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

980

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

981

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

982

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

983

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

984

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

985

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

986

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

987

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

988

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

989

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

990

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

991

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

992

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

993

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

994

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

995

996

#if defined(HAS_BIAS)

997

// Add bias

998

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

999

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1000

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1001

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1002

out00 += (DATA_TYPE)b;

1003

out01 += (DATA_TYPE)b;

1004

out02 += (DATA_TYPE)b;

1005

out03 += (DATA_TYPE)b;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1006

#endif // defined(HAS_BIAS)

1007

1008

// Store the output tile

1009

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1010

VEC_DATA_TYPE(DATA_TYPE, 4)

1011

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

1012

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;

1013

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;

1014

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;

1015

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1016

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1017

vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1018

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1019

1020

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1021

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1022

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1023

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1024

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1025

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1026

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1027

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1028

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1029

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1030

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1031

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1032

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1033

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1034

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1035

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1036

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1037

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1038

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1039

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1040

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1041

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1042

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1043

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1044

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1045

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1046

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1047

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1048

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1049

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1050

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1051

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1052

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1053

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1054

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1055

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1056

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1057

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1058

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1059

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1060

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1061

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1062

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1063

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1064

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1065

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1066

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1067

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1068

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1069

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1070

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1071

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1072

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1073

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1074

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1075

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1076

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1077

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1078

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1079

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1080

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1081

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1082

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1083

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1084

1085

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1086

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1087

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1088

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1089

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1090

1091

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1092

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1093

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1094

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1095

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1096

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1097

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1098

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1099

1100

// Compute the 4x4 output tile

1101

comm_fact0 = tmp_col1 + tmp_col2;

1102

comm_fact1 = tmp_col3 + tmp_col4;

1103

comm_fact2 = tmp_col5 + tmp_col6;

1104

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1105

VEC_DATA_TYPE(float, 4)

1106

out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;

1107

VEC_DATA_TYPE(float, 4)

1108

out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1109

1110

comm_fact0 = tmp_col1 - tmp_col2;

1111

comm_fact1 = tmp_col3 - tmp_col4;

1112

comm_fact2 = tmp_col5 - tmp_col6;

1113

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1114

VEC_DATA_TYPE(float, 4)

1115

out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;

1116

VEC_DATA_TYPE(float, 4)

1117

out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1118

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1119

#if defined(HAS_BIAS)

1120

// Add bias

1121

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1122

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1123

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1124

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1125

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1126

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1127

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1128

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1129

#endif // defined(HAS_BIAS)

1130

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1131

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1132

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

1133

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

1134

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

1135

vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1136

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1137

}

1138

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1139

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1140

*

1141

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1142

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1143

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1144

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1145

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1146

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1147

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1148

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1149

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1150

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1151

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1152

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1153

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1154

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1155

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1156

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1157

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1158

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1159

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1160

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1161

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1162

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1163

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1164

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1165

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1166

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1167

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1168

*/

1169

__kernel void winograd_output_transform_4x4_5x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1170

TENSOR4D_DECLARATION(src),

1171

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1172

#if defined(HAS_BIAS)

1173

VECTOR_DECLARATION(bias),

1174

#endif // defined(HAS_BIAS)

1175

int dst_size)

1176

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1177

// Each thread stores a 4x4/4x1 or 1x4 tile

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1178

#if defined(SRC_DEPTH)

1179

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1180

const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1181

#else /* defined(SRC_DEPTH) */

1182

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1183

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

1184

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1185

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1186

int y_in = get_global_id(1);

1187

int x_out = get_global_id(0);

1188

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

1189

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1190

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1191

int batch = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1192

#endif /* defined(SRC_DEPTH) */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1193

1194

// Load the values across the channels to compose the input tile

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1195

DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

1196

DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

1197

DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

1198

DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

1199

DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

1200

DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

1201

DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

1202

DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1203

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1204

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1205

// Compute out00, out01, out02 and out03

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1206

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

1207

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

1208

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

1209

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1210

1211

#if defined(HAS_BIAS)

1212

// Add bias

1213

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1214

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1215

float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1216

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1221

#endif // defined(HAS_BIAS)

1222

1223

// Store the output tile

1224

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1225

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1226

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1227

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1228

#else /* defined(SRC_DEPTH) */

1229

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1230

#endif /* defined(SRC_DEPTH) */

1231

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1232

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1233

VEC_DATA_TYPE(DATA_TYPE, 4)

1234

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

1235

*(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;

1236

*(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;

1237

*(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;

1238

*(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1239

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1240

// Get output address

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1241

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1242

VEC_DATA_TYPE(DATA_TYPE, 4)

1243

out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));

1244

*(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;

1245

*(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;

1246

*(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;

1247

*(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1248

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1249

1250

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1251

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1252

DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));

1253

DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));

1254

DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));

1255

DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));

1256

DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));

1257

DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));

1258

DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));

1259

DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1260

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1261

DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));

1262

DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));

1263

DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));

1264

DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));

1265

DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));

1266

DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));

1267

DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));

1268

DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1269

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1270

DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));

1271

DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));

1272

DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));

1273

DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));

1274

DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));

1275

DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));

1276

DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));

1277

DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1278

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1279

DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));

1280

DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));

1281

DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));

1282

DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));

1283

DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));

1284

DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));

1285

DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));

1286

DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1287

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1288

DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));

1289

DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));

1290

DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));

1291

DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));

1292

DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));

1293

DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));

1294

DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));

1295

DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1296

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1297

DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));

1298

DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));

1299

DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));

1300

DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));

1301

DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));

1302

DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));

1303

DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));

1304

DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1305

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1306

DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));

1307

DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));

1308

DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));

1309

DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));

1310

DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));

1311

DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));

1312

DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));

1313

DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1314

1315

// Compute the 8x4 intermediate tensor

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1316

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1317

comm_fact0, comm_fact1, comm_fact2;

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1318

VEC_DATA_TYPE(float, 4)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1319

tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1320

1321

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1322

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1323

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1324

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1325

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1326

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1327

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1328

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1329

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1330

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1331

comm_fact0 = tmp_col1 + tmp_col2;

1332

comm_fact1 = tmp_col3 + tmp_col4;

1333

comm_fact2 = tmp_col5 + tmp_col6;

1334

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1335

VEC_DATA_TYPE(float, 4)

1336

out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1337

VEC_DATA_TYPE(float, 4)

1338

out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1339

1340

comm_fact0 = tmp_col1 - tmp_col2;

1341

comm_fact1 = tmp_col3 - tmp_col4;

1342

comm_fact2 = tmp_col5 - tmp_col6;

1343

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1344

VEC_DATA_TYPE(float, 4)

1345

out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1346

VEC_DATA_TYPE(float, 4)

1347

out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1348

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1349

#if defined(HAS_BIAS)

1350

// Add bias

1351

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1352

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1353

DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1354

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

1355

out_col0 += (VEC_DATA_TYPE(float, 4))b;

1356

out_col1 += (VEC_DATA_TYPE(float, 4))b;

1357

out_col2 += (VEC_DATA_TYPE(float, 4))b;

1358

out_col3 += (VEC_DATA_TYPE(float, 4))b;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1359

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1360

// Get output address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1361

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1362

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1363

#else /* defined(SRC_DEPTH) */

1364

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);

1365

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

b0b3717

2018-07-20 17:30:56 +0100

[diff] [blame]

1366

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1367

int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1368

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1369

// Store the output tile

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1370

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1371

out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1372

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1373

out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1374

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1375

out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1376

VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1377

out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));

1378

1379

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;

1380

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;

1381

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;

1382

*(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;

1383

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;

1384

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;

1385

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;

1386

*(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;

1387

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;

1388

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;

1389

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;

1390

*(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;

1391

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;

1392

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;

1393

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;

1394

*(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1395

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1396

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1397

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1398

1399

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1400

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1401

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1402

*

1403

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1404

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1405

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1406

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1407

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1408

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1409

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1410

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1411

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1412

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1413

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1414

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1415

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1416

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1417

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1418

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1419

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1420

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1421

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1422

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1423

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1424

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1425

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1426

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1427

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1428

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1429

*/

1430

__kernel void winograd_output_transform_2x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1431

TENSOR4D_DECLARATION(src),

1432

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1433

#if defined(HAS_BIAS)

1434

,

1435

VECTOR_DECLARATION(bias)

1436

#endif // defined(HAS_BIAS)

1437

)

1438

{

1439

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1446

src_stride_w,

1447

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1448

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1456

dst_stride_w,

1457

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1458

dst_offset_first_element_in_bytes

1459

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1465

#endif // defined(HAS_BIAS)

1466

);

1467

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1468

1469

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

1470

*

1471

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1472

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1473

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1474

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1475

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1476

*

1477

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1478

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1479

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1480

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1481

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1482

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1483

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1484

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1485

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1486

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1487

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1488

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1489

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1490

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1491

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1492

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1493

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1494

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1495

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1496

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1497

*/

1498

__kernel void winograd_output_transform_2x1_7x1_nhwc(

1499

TENSOR4D_DECLARATION(src),

1500

TENSOR4D_DECLARATION(dst),

1501

#if defined(HAS_BIAS)

1502

VECTOR_DECLARATION(bias),

1503

#endif // defined(HAS_BIAS)

1504

int dst_size)

1505

{

1506

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1526

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1531

#endif // defined(HAS_BIAS)

1532

dst_size);

1533

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1534

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1535

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1536

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1537

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1538

*

1539

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1540

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1541

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1542

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1543

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1544

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1545

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1546

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1547

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1548

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1549

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1550

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1551

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1552

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1553

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1554

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1555

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1556

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1557

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1558

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1559

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1560

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1561

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1562

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1563

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1564

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1565

*/

1566

__kernel void winograd_output_transform_4x1_3x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1567

TENSOR4D_DECLARATION(src),

1568

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1569

#if defined(HAS_BIAS)

1570

,

1571

VECTOR_DECLARATION(bias)

1572

#endif // defined(HAS_BIAS)

1573

)

1574

{

1575

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1582

src_stride_w,

1583

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1584

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1592

dst_stride_w,

1593

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1594

dst_offset_first_element_in_bytes

1595

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1601

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1606

*

1607

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1608

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1609

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1610

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1611

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1612

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1613

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1614

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1615

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1616

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1617

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1618

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1619

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1620

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1621

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1622

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1623

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1624

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1625

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1626

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1627

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1628

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1629

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1630

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1631

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1632

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1633

*/

1634

__kernel void winograd_output_transform_4x1_5x1_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1635

TENSOR4D_DECLARATION(src),

1636

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1637

#if defined(HAS_BIAS)

1638

,

1639

VECTOR_DECLARATION(bias)

1640

#endif // defined(HAS_BIAS)

1641

)

1642

{

1643

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1650

src_stride_w,

1651

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1652

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1660

dst_stride_w,

1661

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1662

dst_offset_first_element_in_bytes

1663

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1669

#endif // defined(HAS_BIAS)

1670

);

1671

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1672

1673

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1674

*

1675

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1676

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1677

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1678

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1679

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1680

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1681

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1682

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1683

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1684

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1685

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1686

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1687

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1688

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1689

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1690

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1691

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1692

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1693

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1694

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1695

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1696

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1697

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1698

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1699

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1700

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1701

*/

1702

__kernel void winograd_output_transform_4x1_3x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1703

TENSOR4D_DECLARATION(src),

1704

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1705

#if defined(HAS_BIAS)

1706

VECTOR_DECLARATION(bias),

1707

#endif // defined(HAS_BIAS)

1708

int dst_size)

1709

{

1710

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1717

src_stride_w,

1718

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1719

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1727

dst_stride_w,

1728

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1729

dst_offset_first_element_in_bytes,

1730

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1735

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1740

*

1741

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1742

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1743

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1744

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1745

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1746

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1747

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1748

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1749

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1750

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1751

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1752

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1753

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1754

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1755

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1756

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1757

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1758

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1759

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1760

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1761

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1762

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1763

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1764

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1765

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1766

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1767

*/

1768

__kernel void winograd_output_transform_4x1_5x1_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1769

TENSOR4D_DECLARATION(src),

1770

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1771

#if defined(HAS_BIAS)

1772

VECTOR_DECLARATION(bias),

1773

#endif // defined(HAS_BIAS)

1774

int dst_size)

1775

{

1776

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1783

src_stride_w,

1784

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1785

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1793

dst_stride_w,

1794

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1795

dst_offset_first_element_in_bytes,

1796

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1801

#endif // defined(HAS_BIAS)

1802

dst_size);

1803

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1804

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1805

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1806

1807

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1808

#if defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1809

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1810

*

1811

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1812

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1813

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1814

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1815

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1816

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1817

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1818

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1819

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1820

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1821

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1822

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1823

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1824

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1825

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1826

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1827

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1828

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1829

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1830

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1831

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1832

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1833

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1834

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1835

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1836

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1837

*/

1838

__kernel void winograd_output_transform_1x2_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1839

TENSOR4D_DECLARATION(src),

1840

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1841

#if defined(HAS_BIAS)

1842

,

1843

VECTOR_DECLARATION(bias)

1844

#endif // defined(HAS_BIAS)

1845

)

1846

{

1847

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1854

src_stride_w,

1855

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1856

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1864

dst_stride_w,

1865

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1866

dst_offset_first_element_in_bytes

1867

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1873

#endif // defined(HAS_BIAS)

1874

);

1875

}

giuros01

2019-04-01 12:07:02 +0100

[diff] [blame]

1876

1877

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

1878

*

1879

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1880

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1881

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1882

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1883

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1884

*

1885

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1886

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1887

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1888

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1889

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1890

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1891

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1892

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1893

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1894

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1895

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1896

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1897

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1898

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1899

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1900

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1901

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1902

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1903

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1904

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1905

*/

1906

__kernel void winograd_output_transform_1x2_1x7_nhwc(

1907

TENSOR4D_DECLARATION(src),

1908

TENSOR4D_DECLARATION(dst),

1909

#if defined(HAS_BIAS)

1910

VECTOR_DECLARATION(bias),

1911

#endif // defined(HAS_BIAS)

1912

int dst_size)

1913

{

1914

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1934

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1939

#endif // defined(HAS_BIAS)

1940

dst_size);

1941

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1942

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1943

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

1944

#if defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1945

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1946

*

1947

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1948

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1949

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1950

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1951

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1952

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1953

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1954

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1955

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1956

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1957

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1958

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1959

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1960

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1961

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1962

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1963

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1964

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1965

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1966

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1967

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1968

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1969

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1970

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1971

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1972

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1973

*/

1974

__kernel void winograd_output_transform_1x4_1x3_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1975

TENSOR4D_DECLARATION(src),

1976

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1977

#if defined(HAS_BIAS)

1978

,

1979

VECTOR_DECLARATION(bias)

1980

#endif // defined(HAS_BIAS)

1981

)

1982

{

1983

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1990

src_stride_w,

1991

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1992

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2000

dst_stride_w,

2001

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2002

dst_offset_first_element_in_bytes

2003

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2009

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

2014

*

2015

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2016

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2017

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2018

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2019

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2020

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2021

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2022

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2023

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2024

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2025

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2026

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2027

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2028

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2029

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2030

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2031

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2032

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2033

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2034

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2035

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2036

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2037

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2038

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2039

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2040

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2041

*/

2042

__kernel void winograd_output_transform_1x4_1x5_nchw(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2043

TENSOR4D_DECLARATION(src),

2044

TENSOR4D_DECLARATION(dst)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2045

#if defined(HAS_BIAS)

2046

,

2047

VECTOR_DECLARATION(bias)

2048

#endif // defined(HAS_BIAS)

2049

)

2050

{

2051

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2058

src_stride_w,

2059

src_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2060

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2068

dst_stride_w,

2069

dst_step_w,

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2070

dst_offset_first_element_in_bytes

2071

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

2077

#endif // defined(HAS_BIAS)

2078

);

2079

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2080

2081

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

2082

*

2083

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2084

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2085

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2086

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2087

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2088

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2089

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2090

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2091

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2092

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2093

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2094

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2095

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2096

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2097

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2098

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2099

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2100

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2101

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2102

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2103

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2104

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2105

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2106

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2107

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2108

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2109

*/

2110

__kernel void winograd_output_transform_1x4_1x3_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2111

TENSOR4D_DECLARATION(src),

2112

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2113

#if defined(HAS_BIAS)

2114

VECTOR_DECLARATION(bias),

2115

#endif // defined(HAS_BIAS)

2116

int dst_size)

2117

{

2118

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2125

src_stride_w,

2126

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2127

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2135

dst_stride_w,

2136

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2137

dst_offset_first_element_in_bytes,

2138

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2143

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

2148

*

2149

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

2150

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2151

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2152

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2153

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2154

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2155

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2156

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2157

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2158

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2159

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2160

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2161

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2162

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2163

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2164

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2165

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

2166

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2167

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2168

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2169

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2170

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2171

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2172

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2173

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2174

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2175

*/

2176

__kernel void winograd_output_transform_1x4_1x5_nhwc(

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2177

TENSOR4D_DECLARATION(src),

2178

TENSOR4D_DECLARATION(dst),

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2179

#if defined(HAS_BIAS)

2180

VECTOR_DECLARATION(bias),

2181

#endif // defined(HAS_BIAS)

2182

int dst_size)

2183

{

2184

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2191

src_stride_w,

2192

src_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2193

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2201

dst_stride_w,

2202

dst_step_w,

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2203

dst_offset_first_element_in_bytes,

2204

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

2209

#endif // defined(HAS_BIAS)

2210

dst_size);

2211

}

Manuel Bottini

2018-10-02 16:41:52 +0100

[diff] [blame]

2212

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2213

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Georgios Pinitas