Blame - src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl - ml/ComputeLibrary

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

43

* @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time

44

* @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time

45

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

46

*

47

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

48

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

49

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

50

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

51

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

52

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

53

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

54

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

55

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

56

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

57

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

58

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

59

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

60

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

61

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

62

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

64

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

65

*/

66

__kernel void winograd_filter_transform_4x4_3x3_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Summary: each work-item reads either 3 filter taps (when WINOGRAD_FILTER_TRANSFORM_HORIZONTAL or
    // WINOGRAD_FILTER_TRANSFORM_VERTICAL is defined) or a full 3x3 set of taps, computes 6 or 36
    // transformed values, and writes each value to its own Z plane of dst.

    // NOTE(review): this struct is not referenced again in this kernel; all addressing below uses the
    // raw src_* kernel arguments directly.
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    // Base byte address of the taps handled by this work-item, derived from the three global ids
    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;

    // Load the values from the input tensor
    // In the 2D case tap wIJ is read at byte offset I * src_stride_z + J * src_stride_y.
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Vertical (1x3) case: the three taps are spaced by src_stride_z
    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Horizontal (3x1) and 2D cases: the first three taps are spaced by src_stride_y
    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    // 2D case only: remaining six taps of the 3x3 filter
    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The values below are outRC = sum over i,j of G[R][i] * wij * G[C][j], where the rows of G are
    //   { 1/4, 0, 0 }, { -1/6, -1/6, -1/6 }, { -1/6, 1/6, -1/6 },
    //   { 1/24, 1/12, 1/6 }, { 1/24, -1/12, 1/6 }, { 0, 0, 1 }

    // Row 0
    // This is the only row computed in the 3x1 / 1x3 modes; it keeps the 1/4 factor coming from G[0][0].
    // NOTE(review): confirm that this scaling is what the 1D input/output transforms expect.
    DATA_TYPE out00, out01, out02, out03, out04, out05;
    out00 = (w00) / 16.f;
    out01 = (-w00 - w01 - w02) / 24.f;
    out02 = (-w00 + w01 - w02) / 24.f;
    out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
    out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
    out05 = (w02) / 4.f;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    DATA_TYPE out10, out11, out12, out13, out14, out15;
    out10 = (-w00 - w10 - w20) / 24.f;
    out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
    out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
    out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
    out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
    out15 = (-w02 - w12 - w22) / 6.f;

    // Row 2
    DATA_TYPE out20, out21, out22, out23, out24, out25;
    out20 = (-w00 + w10 - w20) / 24.f;
    out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
    out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
    out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
    out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
    out25 = (-w02 + w12 - w22) / 6.f;

    // Row 3
    DATA_TYPE out30, out31, out32, out33, out34, out35;
    out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
    out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
    out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
    out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
    out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
    out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;

    // Row 4
    DATA_TYPE out40, out41, out42, out43, out44, out45;
    out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
    out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
    out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
    out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
    out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
    out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;

    // Row 5
    DATA_TYPE out50, out51, out52, out53, out54, out55;
    out50 = (w20) / 4.f;
    out51 = (-w20 - w21 - w22) / 6.f;
    out52 = (-w20 + w21 - w22) / 6.f;
    out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
    out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
    out55 = (w22);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    int x0 = get_global_id(2); // idx filter
    int y0 = get_global_id(0); // idx channel

    // Get output address
    // The filter index advances along dst X (one element per filter), the channel index along dst Y
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;

    // Store the values across the channels
    // 36 channels for 3x3 kernels
    // 6 channels for 3x1 or 1x3 kernels
    // Value outRC goes to dst Z plane 6 * R + C
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out10;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out11;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out12;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out13;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}

197

198

/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4

199

*

200

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

201

* @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time

202

* @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time

203

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

204

*

205

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

206

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

207

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

208

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

209

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

210

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

211

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

212

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

213

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

214

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

215

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

216

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

217

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

218

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

219

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

220

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

222

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

223

*/

224

__kernel void winograd_filter_transform_4x4_5x5_nhwc(

225

TENSOR4D_DECLARATION(src),

226

TENSOR3D_DECLARATION(dst))

227

{

228

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

229

230

const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;

231

232

#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

233

// Load the values from the input tensor

234

DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

235

DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

236

DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

237

DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

238

DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

239

#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

240

// Load the values from the input tensor

241

DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

242

DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

243

DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

244

DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

245

DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

246

#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

247

248

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

249

DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));

250

DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));

251

DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));

252

DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));

253

DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));

254

DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));

255

DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));

256

DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));

257

DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));

258

DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));

259

DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));

260

DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));

261

DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));

262

DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));

263

DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));

264

DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));

265

DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));

266

DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));

267

DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));

268

DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));

269

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

270

271

// Row 0

272

VEC_DATA_TYPE(DATA_TYPE, 8)

273

out0 = 0.0f;

274

out0.s0 = w00;

275

out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;

276

out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;

277

out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;

278

out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;

279

out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;

280

out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;

281

out0.s7 = w04;

282

283

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

284

// Row 1

285

VEC_DATA_TYPE(DATA_TYPE, 8)

286

out1 = 0.0f;

287

out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;

288

out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;

289

out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;

290

out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *

291

(w04 + w14 + w24 + w34 + w44)) / 405.f;

292

out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *

293

(w04 + w14 + w24 + w34 + w44)) / 405.f;

294

out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +

295

(w04 + w14 + w24 + w34 + w44)) / 810.f;

296

out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +

297

(w04 + w14 + w24 + w34 + w44)) / 810.f;

298

out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;

299

300

// Row 2

301

VEC_DATA_TYPE(DATA_TYPE, 8)

302

out2 = 0.0f;

303

out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;

304

out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;

305

out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;

306

out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *

307

(w04 - w14 + w24 - w34 + w44)) / 405.f;

308

out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *

309

(w04 - w14 + w24 - w34 + w44)) / 405.f;

310

out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +

311

(w04 - w14 + w24 - w34 + w44)) / 810.f;

312

out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +

313

(w04 - w14 + w24 - w34 + w44)) / 810.f;

314

out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;

315

316

// Row 3

317

VEC_DATA_TYPE(DATA_TYPE, 8)

318

out3 = 0.0f;

319

out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;

320

out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +

321

(w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;

322

out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -

323

(w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;

324

out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f

325

* (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;

326

out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f

327

* (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;

328

out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *

329

(w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;

330

out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *

331

(w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;

332

out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;

333

334

// Row 4

335

VEC_DATA_TYPE(DATA_TYPE, 8)

336

out4 = 0.0f;

337

out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;

338

out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +

339

(w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;

340

out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -

341

(w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;

342

out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f

343

* (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;

344

out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f

345

* (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;

346

out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *

347

(w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;

348

out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *

349

(w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;

350

out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;

351

352

// Row 5

353

VEC_DATA_TYPE(DATA_TYPE, 8)

354

out5 = 0.0f;

355

out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;

356

out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +

357

(16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;

358

out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -

359

(16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;

360

out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f

361

* (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;

362

out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f

363

* (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;

364

out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *

365

(16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;

366

out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *

367

(16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;

368

out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;

369

370

// Row 6

371

VEC_DATA_TYPE(DATA_TYPE, 8)

372

out6 = 0.0f;

373

out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;

374

out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +

375

(16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;

376

out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -

377

(16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;

378

out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f

379

* (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;

380

out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f

381

* (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;

382

out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *

383

(16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;

384

out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *

385

(16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;

386

out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;

387

388

// Row 7

389

VEC_DATA_TYPE(DATA_TYPE, 8)

390

out7 = 0.0f;

391

out7.s0 = w40;

392

out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;

393

out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;

394

out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;

395

out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;

396

out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;

397

out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;

398

out7.s7 = w44;

399

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

400

401

int x0 = get_global_id(2); // idx filter

402

int y0 = get_global_id(0); // idx channel

403

404

// Get output address

405

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;

406

407

// Store the values across the channels

408

*(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;

409

*(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;

410

*(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;

411

*(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;

412

*(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;

413

*(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;

414

*(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;

415

*(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;

416

417

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

418

*(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;

419

*(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;

420

*(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;

421

*(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;

422

*(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;

423

*(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;

424

*(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;

425

*(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;

426

*(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;

427

*(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;

428

*(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;

429

*(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;

430

*(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;

431

*(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;

432

*(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;

433

*(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;

434

*(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;

435

*(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;

436

*(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;

437

*(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;

438

*(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;

439

*(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;

440

*(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;

441

*(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;

442

*(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;

443

*(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;

444

*(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;

445

*(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;

446

*(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;

447

*(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;

448

*(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;

449

*(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;

450

*(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;

451

*(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;

452

*(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;

453

*(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;

454

*(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;

455

*(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;

456

*(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;

457

*(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;

458

*(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;

459

*(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;

460

*(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;

461

*(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;

462

*(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;

463

*(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;

464

*(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;

465

*(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;

466

*(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;

467

*(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;

468

*(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;

469

*(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;

470

*(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;

471

*(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;

472

*(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;

473

*(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;

474

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

475

}

476

/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2

477

*

478

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

479

* @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time

480

* @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time

481

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

482

*

483

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

484

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

485

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

486

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

487

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

488

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

489

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

490

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

491

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

492

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

493

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

494

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

495

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

496

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

497

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

498

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

499

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

500

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

501

*/

502

__kernel void winograd_filter_transform_2x2_7x7_nhwc(

503

TENSOR4D_DECLARATION(src),

504

TENSOR3D_DECLARATION(dst))

505

{

506

Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

507

508

const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;

509

510

#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

511

// Load the values from the input tensor

512

DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));

513

DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));

514

DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));

515

DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));

516

DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));

517

DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));

518

DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));

519

#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

520

// Load the values from the input tensor

521

DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

522

DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

523

DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

524

DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

525

DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

526

DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

527

DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));

528

#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

529

530

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

531

DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));

532

DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));

533

DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));

534

DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));

535

DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));

536

DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));

537

DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));

538

539

DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));

540

DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));

541

DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));

542

DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));

543

DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));

544

DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));

545

DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));

546

547

DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));

548

DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));

549

DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));

550

DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));

551

DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));

552

DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));

553

DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));

554

555

DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));

556

DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));

557

DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));

558

DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));

559

DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));

560

DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));

561

DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));

562

563

DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));

564

DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));

565

DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));

566

DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));

567

DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));

568

DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));

569

DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));

570

571

DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));

572

DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));

573

DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));

574

DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));

575

DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));

576

DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));

577

DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));

578

579

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

580

581

VEC_DATA_TYPE(DATA_TYPE, 8)

tmp = 0.0f;

// Row 0

VEC_DATA_TYPE(DATA_TYPE, 8)

586

out0 = 0.0f;

587

588

out0.s0 = -w00 / 36.0f;

589

out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;

590

out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;

591

out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;

592

out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;

593

out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;

594

out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;

595

out0.s7 = w06;

596

597

out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f;

598

599

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

600

601

// Row 1

602

VEC_DATA_TYPE(DATA_TYPE, 8)

603

out1 = 0.0f;

604

605

tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;

606

tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;

607

tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;

608

tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;

609

tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;

610

tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;

611

tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;

612

613

OUTPUT_ROW_2x2_7x7(out1, tmp);

614

615

// Row 2

616

VEC_DATA_TYPE(DATA_TYPE, 8)

617

out2 = 0.0f;

618

619

tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;

620

tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;

621

tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;

622

tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;

623

tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;

624

tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;

625

tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;

626

627

OUTPUT_ROW_2x2_7x7(out2, tmp);

628

629

// Row 3

630

VEC_DATA_TYPE(DATA_TYPE, 8)

631

out3 = 0.0f;

632

633

tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;

634

tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;

635

tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;

636

tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;

637

tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;

638

tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;

639

tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;

640

641

OUTPUT_ROW_2x2_7x7(out3, tmp);

642

643

// Row 4

644

VEC_DATA_TYPE(DATA_TYPE, 8)

645

out4 = 0.0f;

646

647

tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;

648

tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;

649

tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;

650

tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;

651

tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;

652

tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;

653

tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;

654

655

OUTPUT_ROW_2x2_7x7(out4, tmp);

656

657

// Row 5

658

VEC_DATA_TYPE(DATA_TYPE, 8)

659

out5 = 0.0f;

660

661

tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;

662

tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;

663

tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;

664

tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;

665

tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;

666

tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;

667

tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;

668

669

OUTPUT_ROW_2x2_7x7(out5, tmp);

670

671

// Row 6

672

VEC_DATA_TYPE(DATA_TYPE, 8)

673

out6 = 0.0f;

674

675

tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;

676

tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;

677

tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;

678

tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;

679

tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;

680

tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;

681

tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;

682

683

OUTPUT_ROW_2x2_7x7(out6, tmp);

684

685

// Row 7

686

VEC_DATA_TYPE(DATA_TYPE, 8)

out7 = 0.0f;

tmp.s0 = w60;

tmp.s1 = w61;

tmp.s2 = w62;

tmp.s3 = w63;

tmp.s4 = w64;

tmp.s5 = w65;

tmp.s6 = w66;

OUTPUT_ROW_2x2_7x7(out7, tmp);

698

699

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

700

701

int x0 = get_global_id(2); // idx filter

702

int y0 = get_global_id(0); // idx channel

703

704

// Get output address

705

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;

706

707

// Store the values across the channels

708

*(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;

709

*(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;

710

*(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;

711

*(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;

712

*(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;

713

*(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;

714

*(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;

715

*(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;

716

717

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

718

*(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;

719

*(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;

720

*(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;

721

*(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;

722

*(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;

723

*(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;

724

*(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;

725

*(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;

726

*(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;

727

*(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;

728

*(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;

729

*(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;

730

*(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;

731

*(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;

732

*(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;

733

*(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;

734

*(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;

735

*(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;

736

*(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;

737

*(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;

738

*(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;

739

*(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;

740

*(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;

741

*(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;

742

*(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;

743

*(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;

744

*(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;

745

*(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;

746

*(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;

747

*(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;

748

*(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;

749

*(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;

750

*(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;

751

*(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;

752

*(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;

753

*(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;

754

*(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;

755

*(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;

756

*(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;

757

*(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;

758

*(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;

759

*(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;

760

*(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;

761

*(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;

762

*(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;

763

*(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;

764

*(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;

765

*(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;

766

*(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;

767

*(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;

768

*(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;

769

*(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;

770

*(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;

771

*(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;

772

*(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;

773

*(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;

774

#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

775

}

776

#endif // defined(SRC_DIM_Z)

777

778

#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

779

780

/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1

781

*

782

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

783

* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform

784

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

785

*

786

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

787

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

788

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

789

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

790

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

791

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

792

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

793

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

794

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

795

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

796

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

797

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

798

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

799

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

800

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

801

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

802

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

803

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

804

*/

805

__kernel void winograd_filter_transform_4x1_3x1_nhwc(

806

TENSOR4D_DECLARATION(src),

807

TENSOR3D_DECLARATION(dst))

808

{

809

winograd_filter_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes);

827

}

828

829

/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1

830

*

831

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

832

* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform

833

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

834

*

835

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

836

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

837

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

838

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

839

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

840

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

841

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

842

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

843

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

844

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

845

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

846

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

847

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

848

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

849

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

850

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

851

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

852

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

853

*/

854

__kernel void winograd_filter_transform_4x1_5x1_nhwc(

855

TENSOR4D_DECLARATION(src),

856

TENSOR3D_DECLARATION(dst))

857

{

858

winograd_filter_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes);

876

}

877

878

/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1

879

*

880

* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64

881

* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform

882

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.

883

*

884

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

885

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

886

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

887

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

888

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

889

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

890

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

891

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

892

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

893

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

894

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

895

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

896

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

897

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

898

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

899

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

900

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

901

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

902

*/

903

__kernel void winograd_filter_transform_2x1_7x1_nhwc(

904

TENSOR4D_DECLARATION(src),

905

TENSOR3D_DECLARATION(dst))

906

{

907

winograd_filter_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes);

925

}

926

#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

927

928

#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

929

/** Winograd filter transform 1x3 for the NHWC data layout with a 1x4 output tile.
 *
 * This kernel is a thin entry point: it forwards all of its tensor arguments, unchanged,
 * to winograd_filter_transform_4x4_3x3_nhwc.
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_1x4_1x3_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Pass the full src (4D) and dst (3D) argument sets through in declaration order.
    winograd_filter_transform_4x4_3x3_nhwc(
        src_ptr,
        src_stride_x, src_step_x,
        src_stride_y, src_step_y,
        src_stride_z, src_step_z,
        src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr,
        dst_stride_x, dst_step_x,
        dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z,
        dst_offset_first_element_in_bytes);
}

977

978

/** Winograd filter transform 1x5 for the NHWC data layout with a 1x4 output tile.
 *
 * This kernel is a thin entry point: it forwards all of its tensor arguments, unchanged,
 * to winograd_filter_transform_4x4_5x5_nhwc.
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_1x4_1x5_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Pass the full src (4D) and dst (3D) argument sets through in declaration order.
    winograd_filter_transform_4x4_5x5_nhwc(
        src_ptr,
        src_stride_x, src_step_x,
        src_stride_y, src_step_y,
        src_stride_z, src_step_z,
        src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr,
        dst_stride_x, dst_step_x,
        dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z,
        dst_offset_first_element_in_bytes);
}

1026

1027

/** Winograd filter transform 1x7 for the NHWC data layout with a 1x2 output tile.
 *
 * This kernel is a thin entry point: it forwards all of its tensor arguments, unchanged,
 * to winograd_filter_transform_2x2_7x7_nhwc.
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
 * @note NOTE(review): the DATA_TYPE note above lists only float, while @p src_ptr below lists F32/F16 - confirm which is intended.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_1x2_1x7_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Pass the full src (4D) and dst (3D) argument sets through in declaration order.
    winograd_filter_transform_2x2_7x7_nhwc(
        src_ptr,
        src_stride_x, src_step_x,
        src_stride_y, src_step_y,
        src_stride_z, src_step_z,
        src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr,
        dst_stride_x, dst_step_x,
        dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z,
        dst_offset_first_element_in_bytes);
}

1075

#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)