Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-01-12 10:21:40 +0000

[diff] [blame]

42

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

43

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

44

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

45

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

46

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

47

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

48

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

49

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

50

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

51

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

52

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

53

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

54

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

55

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

56

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

57

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

58

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

59

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

60

__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
                                TENSOR3D_DECLARATION(dst))
{
    // Global work-item IDs along X, Y and Z
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source descriptor for this work-item; the vector is read from src.ptr
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Destination byte offset, accumulated term by term.
    // X and Y swap roles with respect to the source: the Y id advances inside a destination row,
    // while the X id selects the destination row (gid_x / MULT_TRANSPOSE1XW_WIDTH) and the
    // TRANSPOSE_W-wide slot within each group of MULT_TRANSPOSE1XW_WIDTH vectors (gid_x % MULT_TRANSPOSE1XW_WIDTH).
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH;
    dst_addr_in_bytes += (gid_x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y;
    dst_addr_in_bytes += (gid_x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);

    // Batched GEMM: move to the destination slice selected by the Z id
    dst_addr_in_bytes += gid_z * dst_stride_z;

    __global DATA_TYPE *src_vec = (__global DATA_TYPE *)src.ptr;
    __global DATA_TYPE *dst_vec = (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes);

    // Copy one TRANSPOSE_W-wide vector from source to destination
    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
    b0 = VLOAD(TRANSPOSE_W)(0, src_vec);

    VSTORE(TRANSPOSE_W)
    (b0, 0, dst_vec);
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

83

#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

84

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

85

#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

86

Gian Marco Iodice

4b90865

2018-10-18 10:21:02 +0100

[diff] [blame]

87

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block. If -DUNROLL_BLOCK is passed at compile time, the 4x4 block

88

* will be simply unrolled.

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

89

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

90

* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)

91

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

92

* @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:

93

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

94

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

95

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

96

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

97

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

98

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

99

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

100

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

101

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

102

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

103

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

104

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

105

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

106

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

107

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

108

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

109

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

110

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

111

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

112

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

113

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

114

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

115

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

116

__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
                                 TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                 ,
                                 uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                )
{
    // Global work-item IDs along X, Y and Z
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source descriptor for this work-item (its ptr is only read when the input is NOT reinterpreted as 3D)
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Destination byte offset, accumulated term by term:
    //  - each X id owns 16 * MULT_INTERLEAVE4X4_HEIGHT elements inside a destination row
    //  - gid_y / MULT_INTERLEAVE4X4_HEIGHT selects the destination row
    //  - gid_y % MULT_INTERLEAVE4X4_HEIGHT selects the 4-element slot inside the interleaved group
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT;
    dst_addr_in_bytes += (gid_y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y;
    dst_addr_in_bytes += (gid_y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);

    // Batched GEMM: move to the destination slice selected by the Z id
    dst_addr_in_bytes += gid_z * dst_stride_z;

    __global DATA_TYPE *dst_block = (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Address of the first of the four source rows handled by this work-item
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes;
    input_ptr += gid_x * 4 * sizeof(DATA_TYPE);
    input_ptr += gid_y * 4 * src_stride_y;

    // Batches live in the fourth dimension, hence src_stride_z is scaled by DEPTH_GEMM3D
    input_ptr += gid_z * src_stride_z * DEPTH_GEMM3D;

    // The 2D tile is read from a 3D tensor, so each of the four rows may belong to a different plane.
    // Rows of consecutive planes are separated by cross_plane_pad padding rows:
    //
    //  |      plane0      |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |      plane1      |
    //  |__________________|
    //
    // Plane index of each row = row index (gid_y * 4 + {0,1,2,3}) / HEIGHT_GEMM3D, clamped to the last plane
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(gid_y * 4)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Extra byte offset contributed by the cross-plane padding in front of each row
    const uint4 pad_off = zin * (cross_plane_pad * src_stride_y);

    __global uchar *row0 = input_ptr + 0 * src_stride_y + pad_off.s0;
    __global uchar *row1 = input_ptr + 1 * src_stride_y + pad_off.s1;
    __global uchar *row2 = input_ptr + 2 * src_stride_y + pad_off.s2;
    __global uchar *row3 = input_ptr + 3 * src_stride_y + pad_off.s3;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    __global uchar *input_ptr = src.ptr;

    __global uchar *row0 = input_ptr + 0 * src_stride_y;
    __global uchar *row1 = input_ptr + 1 * src_stride_y;
    __global uchar *row2 = input_ptr + 2 * src_stride_y;
    __global uchar *row3 = input_ptr + 3 * src_stride_y;
#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Load four consecutive rows of four elements each
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a0 = vload4(0, (__global DATA_TYPE *)row0);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a1 = vload4(0, (__global DATA_TYPE *)row1);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a2 = vload4(0, (__global DATA_TYPE *)row2);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a3 = vload4(0, (__global DATA_TYPE *)row3);

#if defined(UNROLL_BLOCK)
    // Store the four rows as they are (no transposition of the 4x4 block)
    vstore4(a0, 0, dst_block + 0 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4(a1, 0, dst_block + 4 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4(a2, 0, dst_block + 8 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4(a3, 0, dst_block + 12 * MULT_INTERLEAVE4X4_HEIGHT);
#else  // defined(UNROLL_BLOCK)
    // Store the transposed 4x4 block: the i-th vector gathers lane i of every row
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0), 0, dst_block + 0 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1), 0, dst_block + 4 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2), 0, dst_block + 8 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3), 0, dst_block + 12 * MULT_INTERLEAVE4X4_HEIGHT);
#endif // defined(UNROLL_BLOCK)
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

211

#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

212

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

213

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

214

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

215

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

216

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

217

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

218

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

219

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

220

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

221

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

222

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

223

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

224

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

225

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

226

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

227

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

228

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

229

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

230

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

231

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

232

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

233

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

234

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

235

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

236

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

237

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

238

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

239

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

240

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

241

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

242

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

243

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

244

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

245

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

246

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

247

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

248

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

249

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

250

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

251

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

252

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
                                                 uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Row of reshaped matrix B / reshaped matrix A addressed by this work-item, and batch index
    const int blk_x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int blk_y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    const int batch = get_global_id(2);

    // Element offset of this work-item's 4-wide slot inside each interleaved / transposed group
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

#if defined(MATRIX_B_DEPTH)
    // Matrix B has fewer slices than matrix A: wrap the batch index instead of sliding past the end of B
    const int b_slice = (batch % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const int b_slice = batch;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offsets of the rows of matrix A and matrix B read by this work-item
    const int src0_addr_in_bytes = batch * src0_stride_z + blk_y * src0_stride_y + src0_offset_first_element_in_bytes;
    const int src1_addr_in_bytes = blk_x * src1_stride_y + src1_offset_first_element_in_bytes + b_slice * src1_stride_z;

    __global float *a_ptr = (__global float *)(src0_ptr + src0_addr_in_bytes);
    __global float *b_ptr = (__global float *)(src1_ptr + src1_addr_in_bytes);

    // End of the matrix B row. It must be taken before the per-work-item slot offset is applied.
    __global float *b_end = b_ptr + COLS_B;

    a_ptr += offset_row_a;
    b_ptr += offset_row_b;

    // Accumulators: one float4 per row of the 4x4 output block
    float4 acc0 = (float4)(0.0f);
    float4 acc1 = (float4)(0.0f);
    float4 acc2 = (float4)(0.0f);
    float4 acc3 = (float4)(0.0f);

    // Main loop: two accumulation steps per iteration
    while(b_ptr <= (b_end - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)))
    {
        // First step: 4 values of A (interleaved) against 4 values of B (transposed)
        float4 a_vals = vload4(0, a_ptr);
        float4 b_vals = vload4(0, b_ptr);

        acc0 += (float4)a_vals.s0 * b_vals;
        acc1 += (float4)a_vals.s1 * b_vals;
        acc2 += (float4)a_vals.s2 * b_vals;
        acc3 += (float4)a_vals.s3 * b_vals;

        // Second step: same pattern on the next group
        a_vals = vload4(0, a_ptr + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        b_vals = vload4(0, b_ptr + 4 * MULT_TRANSPOSE1XW_WIDTH);

        acc0 += (float4)a_vals.s0 * b_vals;
        acc1 += (float4)a_vals.s1 * b_vals;
        acc2 += (float4)a_vals.s2 * b_vals;
        acc3 += (float4)a_vals.s3 * b_vals;

        a_ptr += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 8 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Left-over loop: one accumulation step per iteration
    while(b_ptr < b_end)
    {
        float4 a_vals = vload4(0, a_ptr);
        float4 b_vals = vload4(0, b_ptr);

        acc0 += (float4)a_vals.s0 * b_vals;
        acc1 += (float4)a_vals.s1 * b_vals;
        acc2 += (float4)a_vals.s2 * b_vals;
        acc3 += (float4)a_vals.s3 * b_vals;

        a_ptr += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 4 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Destination descriptor for this work-item
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

#if defined(ALPHA)
    // Scale the product by ALPHA
    acc0 = acc0 * (float4)ALPHA;
    acc1 = acc1 * (float4)ALPHA;
    acc2 = acc2 * (float4)ALPHA;
    acc3 = acc3 * (float4)ALPHA;
#endif // defined(ALPHA)

    __global uchar *dst_addr = offset(&dst, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D tile is written into a 3D tensor, so each of the four rows may belong to a different plane.
    // Rows of consecutive planes are separated by cross_plane_pad padding rows:
    //
    //  |      plane0      |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |      plane1      |
    //  |__________________|
    //
    // Plane index of each row = row index (get_global_id(1) * 4 + {0,1,2,3}) / HEIGHT_GEMM3D, clamped to the last plane
    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    zout       = min(DEPTH_GEMM3D - 1, zout);

    // Extra byte offset contributed by the cross-plane padding in front of each row
    const uint4 pad_off = zout * (cross_plane_pad * dst_stride_y);

    // Batches live in the fourth dimension, hence dst_stride_z is scaled by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

    __global uchar *row0 = dst_addr + 0 * dst_stride_y + pad_off.s0;
    __global uchar *row1 = dst_addr + 1 * dst_stride_y + pad_off.s1;
    __global uchar *row2 = dst_addr + 2 * dst_stride_y + pad_off.s2;
    __global uchar *row3 = dst_addr + 3 * dst_stride_y + pad_off.s3;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: move to the destination slice selected by the Z id
    dst_addr += batch * dst_stride_z;

    __global uchar *row0 = dst_addr + 0 * dst_stride_y;
    __global uchar *row1 = dst_addr + 1 * dst_stride_y;
    __global uchar *row2 = dst_addr + 2 * dst_stride_y;
    __global uchar *row3 = dst_addr + 3 * dst_stride_y;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Store the 4x4 output block, one row per accumulator
    vstore4(acc0, 0, (__global float *)row0);
    vstore4(acc1, 0, (__global float *)row1);
    vstore4(acc2, 0, (__global float *)row2);
    vstore4(acc3, 0, (__global float *)row3);
}

390

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

391

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

392

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

393

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

394

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

395

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

396

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

397

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

398

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

399

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

400

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

401

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

402

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

403

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

404

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

405

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

406

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

407

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

408

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

409

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

410

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

411

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

412

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

413

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

414

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

415

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

416

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

417

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

418

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

419

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

420

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

421

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

422

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

423

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

424

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

425

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

426

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

427

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

428

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

429

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

430

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst),
                                                         uint src0_stride_z,
                                                         uint src1_stride_z,
                                                         uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                         ,
                                                         uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                         )
{
    // Row of src1, row of src0 and batch slice addressed by this work-item
    const int row_b = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int row_a = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    const int batch = get_global_id(2);

    // Element offset of this work-item's 4-wide lane inside the selected rows
    const int lane_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int lane_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // Byte offsets of the first element read from matrix A (src0) and matrix B (src1)
    int a_offset_in_bytes = batch * src0_stride_z + row_a * src0_stride_y + src0_offset_first_element_in_bytes;
    int b_offset_in_bytes = row_b * src1_stride_y + src1_offset_first_element_in_bytes;

    // When MATRIX_B_DEPTH is defined the slice index of matrix B wraps around instead of
    // following the batch index (matrix B has 3 dimensions and matrix A more than 3)
#if defined(MATRIX_B_DEPTH)
    const int slice_b = batch % MATRIX_B_DEPTH;
#else  // defined(MATRIX_B_DEPTH)
    const int slice_b = batch;
#endif // defined(MATRIX_B_DEPTH)
    b_offset_in_bytes += slice_b * src1_stride_z;

    __global float *a_ptr = (__global float *)(src0_ptr + a_offset_in_bytes) + lane_a;
    __global float *b_ptr = (__global float *)(src1_ptr + b_offset_in_bytes) + lane_b;

    // 4x4 output tile, kept as 16 scalar accumulators (cRC = row R, column C)
    float c00 = 0.0f, c01 = 0.0f, c02 = 0.0f, c03 = 0.0f;
    float c10 = 0.0f, c11 = 0.0f, c12 = 0.0f, c13 = 0.0f;
    float c20 = 0.0f, c21 = 0.0f, c22 = 0.0f, c23 = 0.0f;
    float c30 = 0.0f, c31 = 0.0f, c32 = 0.0f, c33 = 0.0f;

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

    // One accumulation step: load 4 values from matrix A (interleaved) and 4 values from
    // matrix B (transposed), advance both pointers, then add the 4x4 outer product to the tile
#define GEMM_F32_BIFROST_STEP()                 \
    do                                          \
    {                                           \
        const float4 a0 = vload4(0, a_ptr);     \
        const float4 b0 = vload4(0, b_ptr);     \
                                                \
        a_ptr += 4 * MULT_INTERLEAVE4X4_HEIGHT; \
        b_ptr += 4 * MULT_TRANSPOSE1XW_WIDTH;   \
                                                \
        c00 = fma(a0.s0, b0.s0, c00);           \
        c01 = fma(a0.s0, b0.s1, c01);           \
        c02 = fma(a0.s0, b0.s2, c02);           \
        c03 = fma(a0.s0, b0.s3, c03);           \
                                                \
        c10 = fma(a0.s1, b0.s0, c10);           \
        c11 = fma(a0.s1, b0.s1, c11);           \
        c12 = fma(a0.s1, b0.s2, c12);           \
        c13 = fma(a0.s1, b0.s3, c13);           \
                                                \
        c20 = fma(a0.s2, b0.s0, c20);           \
        c21 = fma(a0.s2, b0.s1, c21);           \
        c22 = fma(a0.s2, b0.s2, c22);           \
        c23 = fma(a0.s2, b0.s3, c23);           \
                                                \
        c30 = fma(a0.s3, b0.s0, c30);           \
        c31 = fma(a0.s3, b0.s1, c31);           \
        c32 = fma(a0.s3, b0.s2, c32);           \
        c33 = fma(a0.s3, b0.s3, c33);           \
    } while(0)

    int k = 0;

    // Main loop: four accumulation steps per iteration
    while(k <= (int)(COLS_MTX_B - 4))
    {
        GEMM_F32_BIFROST_STEP();
        GEMM_F32_BIFROST_STEP();
        GEMM_F32_BIFROST_STEP();
        GEMM_F32_BIFROST_STEP();
        k += 4;
    }

    // Left-over loop: one accumulation step per iteration
    while(k < (int)(COLS_MTX_B))
    {
        GEMM_F32_BIFROST_STEP();
        ++k;
    }

#undef GEMM_F32_BIFROST_STEP

#if defined(ALPHA)
    // Scale the whole tile by the weight of the matrix product
    c00 = c00 * ALPHA;
    c01 = c01 * ALPHA;
    c02 = c02 * ALPHA;
    c03 = c03 * ALPHA;
    c10 = c10 * ALPHA;
    c11 = c11 * ALPHA;
    c12 = c12 * ALPHA;
    c13 = c13 * ALPHA;
    c20 = c20 * ALPHA;
    c21 = c21 * ALPHA;
    c22 = c22 * ALPHA;
    c23 = c23 * ALPHA;
    c30 = c30 * ALPHA;
    c31 = c31 * ALPHA;
    c32 = c32 * ALPHA;
    c33 = c33 * ALPHA;
#endif // defined(ALPHA)

    // Address of the first element of the destination tile
    Image           dst      = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is written into a 3D tensor whose planes may be separated by
    // cross_plane_pad padding rows:
    //
    //   plane0 | cross_plane_pad | plane1 | ...
    //
    // The plane index of each of the 4 output rows is its row index (get_global_id(1) * 4 + 0..3)
    // divided by HEIGHT_GEMM3D, clamped to the last plane.
    uint4 plane_pad = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    plane_pad       = min(DEPTH_GEMM3D - 1, plane_pad);

    // Turn the plane index into the byte offset introduced by the cross plane paddings
    plane_pad *= (cross_plane_pad * dst_stride_y);

    // Batched GEMM: batches live in the fourth dimension, hence dst_stride_z is scaled by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

    // Write the four rows of the 4x4 tile
    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y + plane_pad.s0));
    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y + plane_pad.s1));
    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y + plane_pad.s2));
    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y + plane_pad.s3));

#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: move to the destination slice of this batch
    dst_addr += batch * dst_stride_z;

    // Write the four rows of the 4x4 tile
    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}

700

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

701

// Undefine COLS_MTX_B (defined locally inside the kernel above) so that later kernels can redefine it

702

#undef COLS_MTX_B

703

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

704

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

705

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) for F16 data.
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication.
 *  Each work-item accumulates and stores one 4x8 block of the destination.
 *
 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
                                                 uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Row of src1, row of src0 and batch slice addressed by this work-item
    const int row_b = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int row_a = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    const int batch = get_global_id(2);

    // Element offset of this work-item's lane inside the selected rows (4 elements of A, 8 of B)
    const int lane_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int lane_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // Byte offsets of the first element read from matrix A (src0) and matrix B (src1)
    int a_offset_in_bytes = batch * src0_stride_z + row_a * src0_stride_y + src0_offset_first_element_in_bytes;
    int b_offset_in_bytes = row_b * src1_stride_y + src1_offset_first_element_in_bytes;

    // When MATRIX_B_DEPTH is defined the slice index of matrix B wraps around instead of
    // following the batch index (matrix B has 3 dimensions and matrix A more than 3)
#if defined(MATRIX_B_DEPTH)
    const int slice_b = batch % MATRIX_B_DEPTH;
#else  // defined(MATRIX_B_DEPTH)
    const int slice_b = batch;
#endif // defined(MATRIX_B_DEPTH)
    b_offset_in_bytes += slice_b * src1_stride_z;

    __global half *a_ptr = (__global half *)(src0_ptr + a_offset_in_bytes);
    __global half *b_ptr = (__global half *)(src1_ptr + b_offset_in_bytes);

    // End address for matrix B; taken before the lane offset is applied
    __global half *b_end = b_ptr + COLS_B;

    a_ptr += lane_a;
    b_ptr += lane_b;

    // 4x8 output tile: one half8 accumulator per output row
    half8 c00 = 0.0f;
    half8 c10 = 0.0f;
    half8 c20 = 0.0f;
    half8 c30 = 0.0f;

    // Load 4 values from matrix A (interleaved) at a_src and 8 values from matrix B (transposed)
    // at b_src, then add each A value times the B vector to the matching accumulator row
#define GEMM_F16_ACCUMULATE(a_src, b_src)        \
    do                                           \
    {                                            \
        const half4 a0 = vload4(0, (a_src));     \
        const half8 b0 = vload8(0, (b_src));     \
                                                 \
        c00 += (half8)a0.s0 * b0;                \
        c10 += (half8)a0.s1 * b0;                \
        c20 += (half8)a0.s2 * b0;                \
        c30 += (half8)a0.s3 * b0;                \
    } while(0)

    // Main loop: two accumulation steps per iteration
    while(b_ptr <= (b_end - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)))
    {
        GEMM_F16_ACCUMULATE(a_ptr, b_ptr);
        GEMM_F16_ACCUMULATE(a_ptr + 4 * MULT_INTERLEAVE4X4_HEIGHT, b_ptr + 8 * MULT_TRANSPOSE1XW_WIDTH);

        a_ptr += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 16 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Left-over loop: one accumulation step per iteration
    while(b_ptr < b_end)
    {
        GEMM_F16_ACCUMULATE(a_ptr, b_ptr);

        a_ptr += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 8 * MULT_TRANSPOSE1XW_WIDTH;
    }

#undef GEMM_F16_ACCUMULATE

#if defined(ALPHA)
    // Scale the whole tile by the weight of the matrix product
    c00 = c00 * (half8)ALPHA;
    c10 = c10 * (half8)ALPHA;
    c20 = c20 * (half8)ALPHA;
    c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)

    // Address of the first element of the destination tile
    Image           dst      = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is written into a 3D tensor whose planes may be separated by
    // cross_plane_pad padding rows:
    //
    //   plane0 | cross_plane_pad | plane1 | ...
    //
    // The plane index of each of the 4 output rows is its row index (get_global_id(1) * 4 + 0..3)
    // divided by HEIGHT_GEMM3D, clamped to the last plane.
    uint4 plane_pad = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    plane_pad       = min(DEPTH_GEMM3D - 1, plane_pad);

    // Turn the plane index into the byte offset introduced by the cross plane paddings
    plane_pad *= (cross_plane_pad * dst_stride_y);

    // Batched GEMM: batches live in the fourth dimension, hence dst_stride_z is scaled by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

    // Write the four rows of the 4x8 tile
    vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y + plane_pad.s0));
    vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y + plane_pad.s1));
    vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y + plane_pad.s2));
    vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y + plane_pad.s3));

#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: move to the destination slice of this batch
    dst_addr += batch * dst_stride_z;

    // Write the four rows of the 4x8 tile
    vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
    vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
    vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
    vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

881

882

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)

883

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

884

*

885

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

886

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

887

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

888

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

889

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

890

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

891

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

892

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

893

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

894

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

895

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

896

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

897

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

898

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

899

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

900

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

901

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

902

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

903

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

904

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

905

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

906

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

907

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

908

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

909

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

910

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

911

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

912

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

913

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

914

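* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)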

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

915

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

916

*/

917

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

918

IMAGE_DECLARATION(src1),

919

IMAGE_DECLARATION(dst),

920

uint src0_stride_z,

921

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

922

uint dst_stride_z

923

#if defined(REINTERPRET_OUTPUT_AS_3D)

924

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

925

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

926

#endif // REINTERPRET_OUTPUT_AS_3D

927

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

928

{

929

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

930

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

931

int z = get_global_id(2);

932

933

// Offset

934

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

935

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

936

937

// src_addr_a = address of matrix A

938

// src_addr_b = address of matrix B

939

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

940

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

941

942

#if defined(MATRIX_B_DEPTH)

943

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

944

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

945

#else // defined(MATRIX_B_DEPTH)

946

src1_addr_in_bytes += z * src1_stride_z;

947

#endif // defined(MATRIX_B_DEPTH)

948

949

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

950

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

951

952

// Compute end row address for matrix B

953

__global half *src_end_addr_b = src_addr_b + COLS_B;

954

955

src_addr_a += offset_row_a;

956

src_addr_b += offset_row_b;

957

958

// Reset accumulators

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

965

966

int i = 0;

967

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

968

{

969

#if MULT_INTERLEAVE4X4_HEIGHT == 1

970

// Load values from matrix A (interleaved) and matrix B (transposed)

971

half8 a0 = vload8(0, src_addr_a);

972

half8 b0 = vload8(0, src_addr_b);

973

974

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

975

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

976

977

c00 = fma((half8)a0.s0, b0, c00);

978

c10 = fma((half8)a0.s1, b0, c10);

979

c20 = fma((half8)a0.s2, b0, c20);

980

c30 = fma((half8)a0.s3, b0, c30);

981

982

// Load values from matrix B (transposed)

983

b0 = vload8(0, src_addr_b);

984

985

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

986

987

c00 = fma((half8)a0.s4, b0, c00);

988

c10 = fma((half8)a0.s5, b0, c10);

989

c20 = fma((half8)a0.s6, b0, c20);

990

c30 = fma((half8)a0.s7, b0, c30);

991

992

// Load values from matrix A (interleaved) and matrix B (transposed)

993

a0 = vload8(0, src_addr_a);

994

b0 = vload8(0, src_addr_b);

995

996

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

997

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

998

999

c00 = fma((half8)a0.s0, b0, c00);

1000

c10 = fma((half8)a0.s1, b0, c10);

1001

c20 = fma((half8)a0.s2, b0, c20);

1002

c30 = fma((half8)a0.s3, b0, c30);

1003

1004

// Load values from matrix B (transposed)

1005

b0 = vload8(0, src_addr_b);

1006

1007

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1008

1009

c00 = fma((half8)a0.s4, b0, c00);

1010

c10 = fma((half8)a0.s5, b0, c10);

1011

c20 = fma((half8)a0.s6, b0, c20);

1012

c30 = fma((half8)a0.s7, b0, c30);

1013

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

1014

// Load values from matrix A (interleaved) and matrix B (transposed)

1015

half4 a0 = vload4(0, src_addr_a);

1016

half8 b0 = vload8(0, src_addr_b);

1017

1018

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1019

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1020

1021

c00 = fma((half8)a0.s0, b0, c00);

1022

c10 = fma((half8)a0.s1, b0, c10);

1023

c20 = fma((half8)a0.s2, b0, c20);

1024

c30 = fma((half8)a0.s3, b0, c30);

1025

1026

// Load values from matrix A (interleaved) and matrix B (transposed)

1027

a0 = vload4(0, src_addr_a);

1028

b0 = vload8(0, src_addr_b);

1029

1030

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1031

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1032

1033

c00 = fma((half8)a0.s0, b0, c00);

1034

c10 = fma((half8)a0.s1, b0, c10);

1035

c20 = fma((half8)a0.s2, b0, c20);

1036

c30 = fma((half8)a0.s3, b0, c30);

1037

1038

// Load values from matrix A (interleaved) and matrix B (transposed)

1039

a0 = vload4(0, src_addr_a);

1040

b0 = vload8(0, src_addr_b);

1041

1042

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1043

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1044

1045

c00 = fma((half8)a0.s0, b0, c00);

1046

c10 = fma((half8)a0.s1, b0, c10);

1047

c20 = fma((half8)a0.s2, b0, c20);

1048

c30 = fma((half8)a0.s3, b0, c30);

1049

1050

// Load values from matrix A (interleaved) and matrix B (transposed)

1051

a0 = vload4(0, src_addr_a);

1052

b0 = vload8(0, src_addr_b);

1053

1054

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1055

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1056

1057

c00 = fma((half8)a0.s0, b0, c00);

1058

c10 = fma((half8)a0.s1, b0, c10);

1059

c20 = fma((half8)a0.s2, b0, c20);

1060

c30 = fma((half8)a0.s3, b0, c30);

1061

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

1062

}

1063

1064

for(; i < (int)(COLS_MTX_B); ++i)

1065

{

1066

// Load values from matrix A (interleaved) and matrix B (transposed)

1067

half4 a0 = vload4(0, src_addr_a);

1068

half8 b0 = vload8(0, src_addr_b);

1069

1070

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1071

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1072

1073

c00 = fma((half8)a0.s0, b0, c00);

1074

c10 = fma((half8)a0.s1, b0, c10);

1075

c20 = fma((half8)a0.s2, b0, c20);

1076

c30 = fma((half8)a0.s3, b0, c30);

1077

}

1078

1079

// Compute destination address

1080

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1081

1082

#if defined(ALPHA)

1083

// Multiply by the weight of matrix product

1084

c00 = c00 * (half8)ALPHA;

1085

c10 = c10 * (half8)ALPHA;

1086

c20 = c20 * (half8)ALPHA;

1087

c30 = c30 * (half8)ALPHA;

1088

#endif // defined(ALPHA)

1089

1090

// Compute dst address

1091

__global uchar *dst_addr = offset(&dst, 0, 0);

1092

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1093

#if defined(REINTERPRET_OUTPUT_AS_3D)

1094

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1095

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1096

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

1101

// |******************|

1102

// | cross_plane_pad |

1103

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1108

1109

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

1110

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

1111

zout = min(DEPTH_GEMM3D - 1, zout);

1112

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1113

// Add offset due to the cross plane paddings

1114

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1115

1116

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1117

// multiply dst_stride_z by DEPTH_GEMM3D

1118

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1119

1120

// Store 4x8 block

1121

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

1122

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

1123

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

1124

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

1125

1126

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1127

// Add offset for batched GEMM

1128

dst_addr += z * dst_stride_z;

1129

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

1130

// Store 4x8 block

1131

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

1132

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

1133

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

1134

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1135

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

1136

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

1137

1138

// Undefine local defines

1139

#undef COLS_MTX_B

1140

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

1141

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1142

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1143

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1144

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1145

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

1146

#if defined(DATA_TYPE)

1147

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1148

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1149

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1150

* @note This OpenCL kernel works with floating point data types (F16/F32)

1151

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1152

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1153

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1154

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

1155

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1156

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1157

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1158

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1159

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1160

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1161

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1162

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

1163

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1164

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1165

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1166

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

1167

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1168

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

1169

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1170

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1171

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1172

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

1173

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1174

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

1175

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1176

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1177

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1178

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1179

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1180

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1181

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1182

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1183

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1184

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1185

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1186

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1187

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1188

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

1189

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1190

IMAGE_DECLARATION(dst),

1191

uint src0_stride_z,

1192

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1193

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1194

#if defined(REINTERPRET_INPUT_AS_3D)

1195

,

1196

uint src_cross_plane_pad

1197

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1198

#if defined(REINTERPRET_OUTPUT_AS_3D)

1199

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1200

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1201

#endif // REINTERPRET_OUTPUT_AS_3D

1202

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1203

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1204

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1205

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1206

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1207

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1208

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1209

// Update address for the matrix A

1210

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1211

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1212

// Update address for the matrix B

1213

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1214

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1215

#if defined(REINTERPRET_INPUT_AS_3D)

1216

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1217

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1223

// |******************|

1224

// | cross_plane_pad |

1225

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1230

1231

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1232

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1233

zin = min(DEPTH_GEMM3D - 1, zin);

1234

1235

// Add offset due to the cross plane paddings

1236

zin *= (src_cross_plane_pad * src0_stride_y);

1237

1238

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1239

// multiply src0_stride_z by DEPTH_GEMM3D

1240

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1241

1242

#else // defined(REINTERPRET_INPUT_AS_3D)

1243

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1244

// Add offset for batched GEMM

1245

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1246

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1247

#endif // defined(REINTERPRET_INPUT_AS_3D)

1248

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1249

#if defined(MATRIX_B_DEPTH)

1250

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1251

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1252

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1253

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1254

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1255

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1256

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

1257

1258

VECTOR_TYPE acc0 = 0.0f;

1259

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1260

VECTOR_TYPE acc1 = 0.0f;

1261

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1262

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1263

VECTOR_TYPE acc2 = 0.0f;

1264

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1265

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1266

VECTOR_TYPE acc3 = 0.0f;

1267

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1268

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1269

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1270

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1271

#if defined(REINTERPRET_INPUT_AS_3D)

1272

// Load values from matrix A

1273

VEC_DATA_TYPE(DATA_TYPE, 2)

1274

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1275

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1276

VEC_DATA_TYPE(DATA_TYPE, 2)

1277

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1278

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1279

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1280

VEC_DATA_TYPE(DATA_TYPE, 2)

1281

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1282

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1283

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1284

VEC_DATA_TYPE(DATA_TYPE, 2)

1285

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1286

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1287

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1288

// Load values from matrix A

1289

VEC_DATA_TYPE(DATA_TYPE, 2)

1290

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1291

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1292

VEC_DATA_TYPE(DATA_TYPE, 2)

1293

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1294

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1295

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1296

VEC_DATA_TYPE(DATA_TYPE, 2)

1297

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1298

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1299

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1300

VEC_DATA_TYPE(DATA_TYPE, 2)

1301

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1302

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1303

#endif // defined(REINTERPRET_INPUT_AS_3D)

1304

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1305

// Load values from matrix B

1306

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

1307

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1308

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1309

// Accumulate

1310

acc0 += b0 * (VECTOR_TYPE)a0.s0;

1311

acc0 += b1 * (VECTOR_TYPE)a0.s1;

1312

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1313

acc1 += b0 * (VECTOR_TYPE)a1.s0;

1314

acc1 += b1 * (VECTOR_TYPE)a1.s1;

1315

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1316

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1317

acc2 += b0 * (VECTOR_TYPE)a2.s0;

1318

acc2 += b1 * (VECTOR_TYPE)a2.s1;

1319

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1320

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1321

acc3 += b0 * (VECTOR_TYPE)a3.s0;

1322

acc3 += b1 * (VECTOR_TYPE)a3.s1;

1323

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1324

}

1325

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1326

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1327

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1328

#if defined(REINTERPRET_INPUT_AS_3D)

1329

// Load values from matrix A

1330

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1331

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1332

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1333

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1334

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1335

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1336

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1337

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1338

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1339

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1340

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1341

// Load values from matrix A

1342

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1343

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1344

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1345

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1346

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1347

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1348

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1349

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1350

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1351

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1352

#endif // defined(REINTERPRET_INPUT_AS_3D)

1353

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1354

// Load values from matrix B

1355

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1356

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1357

// Accumulate

1358

acc0 += b0 * (VECTOR_TYPE)a0;

1359

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1360

acc1 += b0 * (VECTOR_TYPE)a1;

1361

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1362

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1363

acc2 += b0 * (VECTOR_TYPE)a2;

1364

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1365

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1366

acc3 += b0 * (VECTOR_TYPE)a3;

1367

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1368

}

1369

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1370

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1371

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1372

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1373

// Compute dst address

1374

__global uchar *dst_addr = offset(&dst, 0, 0);

1375

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1376

// Multiply by the weight of matrix-matrix product and store the result

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1377

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1378

acc0 = acc0 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1379

#endif // defined(ALPHA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1380

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

1381

acc1 = acc1 * (VECTOR_TYPE)ALPHA;

1382

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

1383

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

1384

acc2 = acc2 * (VECTOR_TYPE)ALPHA;

1385

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

1386

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

1387

acc3 = acc3 * (VECTOR_TYPE)ALPHA;

1388

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

1389

1390

int z = get_global_id(2);

1391

1392

#if defined(REINTERPRET_OUTPUT_AS_3D)

1393

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1394

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1395

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

1400

// |******************|

1401

// | cross_plane_pad |

1402

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1407

1408

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1409

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1410

zout = min(DEPTH_GEMM3D - 1, zout);

1411

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1412

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1413

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1414

1415

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1416

// multiply dst_stride_z by DEPTH_GEMM3D

1417

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1418

1419

// Store output block

1420

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1421

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout.s0));

1422

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1423

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1424

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout.s1));

1425

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1426

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1427

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1428

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout.s2));

1429

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1430

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1431

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1432

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout.s3));

1433

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1434

1435

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1436

// Add offset for batched GEMM

1437

dst_addr += z * dst_stride_z;

1438

1439

// Store output block

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1440

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1441

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1442

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1443

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1444

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1445

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1446

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1447

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1448

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1449

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1450

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1451

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1452

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1453

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1454

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1455

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1456

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1457

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1458

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1459

*

1460

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1461

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1462

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1463

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1464

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1465

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

1466

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1467

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1468

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1469

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1470

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1471

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1472

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1473

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

1474

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1475

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1476

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1477

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

1478

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1479

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

1480

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1481

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1482

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1483

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

1484

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1485

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

1486

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1487

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1488

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1489

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1490

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1491

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1492

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1493

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1494

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1495

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1496

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1497

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1498

*/

1499

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

1500

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1501

IMAGE_DECLARATION(dst),

1502

uint src0_stride_z,

1503

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1504

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1505

#if defined(REINTERPRET_INPUT_AS_3D)

1506

,

1507

uint src_cross_plane_pad

1508

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1509

#if defined(REINTERPRET_OUTPUT_AS_3D)

1510

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1511

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1512

#endif // REINTERPRET_OUTPUT_AS_3D

1513

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1514

{

1515

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1516

1517

// Compute starting address for matrix A and matrix B

1518

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1519

1520

// Update address for matrix A

1521

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1522

1523

// Update address for matrix B

1524

src_addr.s1 += idx * sizeof(float);

1525

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1526

#if defined(REINTERPRET_INPUT_AS_3D)

1527

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1528

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1534

// |******************|

1535

// | cross_plane_pad |

1536

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1541

1542

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1543

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1544

zin = min(DEPTH_GEMM3D - 1, zin);

1545

1546

// Add offset due to the cross plane paddings

1547

zin *= (src_cross_plane_pad * src0_stride_y);

1548

1549

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1550

// multiply src0_stride_z by DEPTH_GEMM3D

1551

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1552

1553

#else // defined(REINTERPRET_INPUT_AS_3D)

1554

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1555

// Add offset for batched GEMM

1556

src_addr.s0 += get_global_id(2) * src0_stride_z;

1557

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1558

#endif // defined(REINTERPRET_INPUT_AS_3D)

1559

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1560

#if defined(MATRIX_B_DEPTH)

1561

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1562

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1563

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1564

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1565

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1566

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1567

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

float acc02 = 0.0f;

float acc03 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

float acc10 = 0.0f;

float acc11 = 0.0f;

float acc12 = 0.0f;

float acc13 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1579

1580

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

float acc20 = 0.0f;

float acc21 = 0.0f;

float acc22 = 0.0f;

float acc23 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1586

1587

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

float acc30 = 0.0f;

float acc31 = 0.0f;

float acc32 = 0.0f;

float acc33 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1593

1594

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1595

int i = 0;

1596

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1597

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1598

#if defined(REINTERPRET_INPUT_AS_3D)

1599

// Load values from matrix A and matrix B

1600

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1601

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1602

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1603

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1604

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1605

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1606

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1607

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1608

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1609

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1610

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1611

// Load values from matrix A and matrix B

1612

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1613

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1614

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1615

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1616

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1617

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1618

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1619

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1620

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1621

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1622

#endif // defined(REINTERPRET_INPUT_AS_3D)

1623

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1624

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1625

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1626

1627

// Multiply and accumulate

1628

acc00 = fma(a0.s0, b0.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1629

acc01 = fma(a0.s0, b0.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1630

acc02 = fma(a0.s0, b0.s2, acc02);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1631

acc03 = fma(a0.s0, b0.s3, acc03);

1632

1633

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1634

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1635

acc10 = fma(a1.s0, b0.s0, acc10);

1636

acc11 = fma(a1.s0, b0.s1, acc11);

1637

acc12 = fma(a1.s0, b0.s2, acc12);

1638

acc13 = fma(a1.s0, b0.s3, acc13);

1639

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1640

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1641

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1642

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1643

acc20 = fma(a2.s0, b0.s0, acc20);

1644

acc21 = fma(a2.s0, b0.s1, acc21);

1645

acc22 = fma(a2.s0, b0.s2, acc22);

1646

acc23 = fma(a2.s0, b0.s3, acc23);

1647

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1648

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1649

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1650

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1651

acc30 = fma(a3.s0, b0.s0, acc30);

1652

acc31 = fma(a3.s0, b0.s1, acc31);

1653

acc32 = fma(a3.s0, b0.s2, acc32);

1654

acc33 = fma(a3.s0, b0.s3, acc33);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1655

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1656

1657

// Load values from matrix A and matrix B

1658

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1659

src_addr.s1 += src1_stride_y;

1660

1661

// Multiply and accumulate

1662

acc00 = fma(a0.s1, b0.s0, acc00);

1663

acc01 = fma(a0.s1, b0.s1, acc01);

1664

acc02 = fma(a0.s1, b0.s2, acc02);

1665

acc03 = fma(a0.s1, b0.s3, acc03);

1666

1667

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1668

1669

acc10 = fma(a1.s1, b0.s0, acc10);

1670

acc11 = fma(a1.s1, b0.s1, acc11);

1671

acc12 = fma(a1.s1, b0.s2, acc12);

1672

acc13 = fma(a1.s1, b0.s3, acc13);

1673

1674

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1675

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1676

1677

acc20 = fma(a2.s1, b0.s0, acc20);

1678

acc21 = fma(a2.s1, b0.s1, acc21);

1679

acc22 = fma(a2.s1, b0.s2, acc22);

1680

acc23 = fma(a2.s1, b0.s3, acc23);

1681

1682

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1683

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1684

1685

acc30 = fma(a3.s1, b0.s0, acc30);

1686

acc31 = fma(a3.s1, b0.s1, acc31);

1687

acc32 = fma(a3.s1, b0.s2, acc32);

1688

acc33 = fma(a3.s1, b0.s3, acc33);

1689

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1690

1691

// Load values from matrix A and matrix B

1692

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1693

src_addr.s1 += src1_stride_y;

1694

1695

// Multiply and accumulate

1696

acc00 = fma(a0.s2, b0.s0, acc00);

1697

acc01 = fma(a0.s2, b0.s1, acc01);

1698

acc02 = fma(a0.s2, b0.s2, acc02);

1699

acc03 = fma(a0.s2, b0.s3, acc03);

1700

1701

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1702

1703

acc10 = fma(a1.s2, b0.s0, acc10);

1704

acc11 = fma(a1.s2, b0.s1, acc11);

1705

acc12 = fma(a1.s2, b0.s2, acc12);

1706

acc13 = fma(a1.s2, b0.s3, acc13);

1707

1708

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1709

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1710

1711

acc20 = fma(a2.s2, b0.s0, acc20);

1712

acc21 = fma(a2.s2, b0.s1, acc21);

1713

acc22 = fma(a2.s2, b0.s2, acc22);

1714

acc23 = fma(a2.s2, b0.s3, acc23);

1715

1716

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1717

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1718

1719

acc30 = fma(a3.s2, b0.s0, acc30);

1720

acc31 = fma(a3.s2, b0.s1, acc31);

1721

acc32 = fma(a3.s2, b0.s2, acc32);

1722

acc33 = fma(a3.s2, b0.s3, acc33);

1723

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1724

1725

// Load values from matrix A and matrix B

1726

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1727

src_addr.s1 += src1_stride_y;

1728

1729

// Multiply and accumulate

1730

acc00 = fma(a0.s3, b0.s0, acc00);

1731

acc01 = fma(a0.s3, b0.s1, acc01);

1732

acc02 = fma(a0.s3, b0.s2, acc02);

1733

acc03 = fma(a0.s3, b0.s3, acc03);

1734

1735

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1736

1737

acc10 = fma(a1.s3, b0.s0, acc10);

1738

acc11 = fma(a1.s3, b0.s1, acc11);

1739

acc12 = fma(a1.s3, b0.s2, acc12);

1740

acc13 = fma(a1.s3, b0.s3, acc13);

1741

1742

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1743

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1744

1745

acc20 = fma(a2.s3, b0.s0, acc20);

1746

acc21 = fma(a2.s3, b0.s1, acc21);

1747

acc22 = fma(a2.s3, b0.s2, acc22);

1748

acc23 = fma(a2.s3, b0.s3, acc23);

1749

1750

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1751

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1752

1753

acc30 = fma(a3.s3, b0.s0, acc30);

1754

acc31 = fma(a3.s3, b0.s1, acc31);

1755

acc32 = fma(a3.s3, b0.s2, acc32);

1756

acc33 = fma(a3.s3, b0.s3, acc33);

1757

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1758

1759

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1760

}

1761

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1762

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1763

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1764

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1765

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1766

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1767

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1768

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1769

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1770

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1771

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1772

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1774

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1776

#else // defined(REINTERPRET_INPUT_AS_3D)

1777

// Load values from matrix A

1778

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1780

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1782

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1783

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1784

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1785

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1786

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1787

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1788

#endif // defined(REINTERPRET_INPUT_AS_3D)

1789

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1790

// Load values from matrix B

1791

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1792

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1793

1794

// Multiply and accumulate

1795

acc00 = fma(a0, b0.s0, acc00);

1796

acc01 = fma(a0, b0.s1, acc01);

1797

acc02 = fma(a0, b0.s2, acc02);

1798

acc03 = fma(a0, b0.s3, acc03);

1799

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1800

acc10 = fma(a1, b0.s0, acc10);

1801

acc11 = fma(a1, b0.s1, acc11);

1802

acc12 = fma(a1, b0.s2, acc12);

1803

acc13 = fma(a1, b0.s3, acc13);

1804

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1805

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1806

acc20 = fma(a2, b0.s0, acc20);

1807

acc21 = fma(a2, b0.s1, acc21);

1808

acc22 = fma(a2, b0.s2, acc22);

1809

acc23 = fma(a2, b0.s3, acc23);

1810

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1811

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1812

acc30 = fma(a3, b0.s0, acc30);

1813

acc31 = fma(a3, b0.s1, acc31);

1814

acc32 = fma(a3, b0.s2, acc32);

1815

acc33 = fma(a3, b0.s3, acc33);

1816

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1817

1818

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1819

}

1820

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1821

int z = get_global_id(2);

1822

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1823

// Compute destination address

1824

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1825

1826

// Multiply by the weight of matrix-matrix product and store the result

1827

#if defined(ALPHA)

1828

acc00 = acc00 * ALPHA;

1829

acc01 = acc01 * ALPHA;

1830

acc02 = acc02 * ALPHA;

1831

acc03 = acc03 * ALPHA;

1832

#endif // defined(ALPHA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1833

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1834

acc10 = acc10 * ALPHA;

1835

acc11 = acc11 * ALPHA;

1836

acc12 = acc12 * ALPHA;

1837

acc13 = acc13 * ALPHA;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1838

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

1839

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1840

acc20 = acc20 * ALPHA;

1841

acc21 = acc21 * ALPHA;

1842

acc22 = acc22 * ALPHA;

1843

acc23 = acc23 * ALPHA;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1844

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

1845

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1846

acc30 = acc30 * ALPHA;

1847

acc31 = acc31 * ALPHA;

1848

acc32 = acc32 * ALPHA;

1849

acc33 = acc33 * ALPHA;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1850

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

1851

1852

// Compute dst address

1853

__global uchar *dst_addr = offset(&dst, 0, 0);

1854

1855

#if defined(REINTERPRET_OUTPUT_AS_3D)

1856

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1857

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1858

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

1863

// |******************|

1864

// | cross_plane_pad |

1865

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1870

1871

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1872

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1873

zout = min(DEPTH_GEMM3D - 1, zout);

1874

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1875

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1876

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1877

1878

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1879

// multiply dst_stride_z by DEPTH_GEMM3D

1880

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1881

1882

// Store the output block

1883

vstore4((float4)(acc00, acc01, acc02, acc03), 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

1884

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1885

vstore4((float4)(acc10, acc11, acc12, acc13), 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

1886

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1887

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1888

vstore4((float4)(acc20, acc21, acc22, acc23), 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

1889

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1890

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1891

vstore4((float4)(acc30, acc31, acc32, acc33), 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1892

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1893

1894

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1895

// Add offset for batched GEMM

1896

dst_addr += z * dst_stride_z;

1897

1898

// Store the output block

1899

vstore4((float4)(acc00, acc01, acc02, acc03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

1900

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1901

vstore4((float4)(acc10, acc11, acc12, acc13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

1902

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1903

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1904

vstore4((float4)(acc20, acc21, acc22, acc23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

1905

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1906

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1907

vstore4((float4)(acc30, acc31, acc32, acc33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

1908

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1909

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1910

}

1911

1912

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1913

*

1914

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1915

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

1916

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1917

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

1918

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1919

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1920

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1921

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1922

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1923

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1924

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1925

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1926

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1927

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1928

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1929

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1930

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1931

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1932

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

1933

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1934

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

1935

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1936

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1937

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1938

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

1939

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1940

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

1941

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1942

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1943

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1944

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1945

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1946

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1947

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1948

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1949

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1950

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1951

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1952

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1953

*/

1954

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

1955

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1956

IMAGE_DECLARATION(dst),

1957

uint src0_stride_z,

1958

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1959

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1960

#if defined(REINTERPRET_INPUT_AS_3D)

1961

,

1962

uint src_cross_plane_pad

1963

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1964

#if defined(REINTERPRET_OUTPUT_AS_3D)

1965

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1966

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1967

#endif // REINTERPRET_OUTPUT_AS_3D

1968

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1969

{

1970

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1971

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1972

1973

// Compute starting address for matrix A and Matrix B

1974

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1975

1976

// Update address for the matrix A

1977

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1978

1979

// Update address for the matrix B

1980

src_addr.s1 += idx * sizeof(float);

1981

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1982

#if defined(REINTERPRET_INPUT_AS_3D)

1983

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1984

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1990

// |******************|

1991

// | cross_plane_pad |

1992

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1997

1998

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1999

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

2000

zin = min(DEPTH_GEMM3D - 1, zin);

2001

2002

// Add offset due to the cross plane paddings

2003

zin *= (src_cross_plane_pad * src0_stride_y);

2004

2005

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2006

// multiply src0_stride_z by DEPTH_GEMM3D

2007

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

2008

2009

#else // defined(REINTERPRET_INPUT_AS_3D)

2010

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2011

// Add offset for batched GEMM

2012

src_addr.s0 += get_global_id(2) * src0_stride_z;

2013

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2014

#endif // defined(REINTERPRET_INPUT_AS_3D)

2015

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2016

#if defined(MATRIX_B_DEPTH)

2017

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2018

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

2019

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2020

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2021

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2022

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2023

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2028

float acc10 = 0.0f;

2029

float acc11 = 0.0f;

2030

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2031

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2032

float acc20 = 0.0f;

2033

float acc21 = 0.0f;

2034

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2035

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2036

float acc30 = 0.0f;

2037

float acc31 = 0.0f;

2038

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2039

2040

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2041

int i = 0;

2042

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2043

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2044

#if defined(REINTERPRET_INPUT_AS_3D)

2045

// Load values from matrix A

2046

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

2047

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2048

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2049

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2050

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2051

2052

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2053

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2054

src_addr.s1 += src1_stride_y;

2055

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2056

src_addr.s1 += src1_stride_y;

2057

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2058

src_addr.s1 += src1_stride_y;

2059

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2060

src_addr.s1 += src1_stride_y;

2061

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2062

src_addr.s1 += src1_stride_y;

2063

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2064

src_addr.s1 += src1_stride_y;

2065

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2066

src_addr.s1 += src1_stride_y;

2067

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2068

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2069

2070

// Multiply and accumulate

2071

acc00 = fma(a0.s0, b0.s0, acc00);

2072

acc00 = fma(a0.s1, b1.s0, acc00);

2073

acc00 = fma(a0.s2, b2.s0, acc00);

2074

acc00 = fma(a0.s3, b3.s0, acc00);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2075

acc00 = fma(a0.s4, b4.s0, acc00);

2076

acc00 = fma(a0.s5, b5.s0, acc00);

2077

acc00 = fma(a0.s6, b6.s0, acc00);

2078

acc00 = fma(a0.s7, b7.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2079

2080

acc01 = fma(a0.s0, b0.s1, acc01);

2081

acc01 = fma(a0.s1, b1.s1, acc01);

2082

acc01 = fma(a0.s2, b2.s1, acc01);

2083

acc01 = fma(a0.s3, b3.s1, acc01);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2084

acc01 = fma(a0.s4, b4.s1, acc01);

2085

acc01 = fma(a0.s5, b5.s1, acc01);

2086

acc01 = fma(a0.s6, b6.s1, acc01);

2087

acc01 = fma(a0.s7, b7.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2088

2089

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2090

#if defined(REINTERPRET_INPUT_AS_3D)

2091

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2092

#else // defined(REINTERPRET_INPUT_AS_3D)

2093

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2094

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2095

acc10 = fma(a0.s0, b0.s0, acc10);

2096

acc10 = fma(a0.s1, b1.s0, acc10);

2097

acc10 = fma(a0.s2, b2.s0, acc10);

2098

acc10 = fma(a0.s3, b3.s0, acc10);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2099

acc10 = fma(a0.s4, b4.s0, acc10);

2100

acc10 = fma(a0.s5, b5.s0, acc10);

2101

acc10 = fma(a0.s6, b6.s0, acc10);

2102

acc10 = fma(a0.s7, b7.s0, acc10);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2103

2104

acc11 = fma(a0.s0, b0.s1, acc11);

2105

acc11 = fma(a0.s1, b1.s1, acc11);

2106

acc11 = fma(a0.s2, b2.s1, acc11);

2107

acc11 = fma(a0.s3, b3.s1, acc11);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2108

acc11 = fma(a0.s4, b4.s1, acc11);

2109

acc11 = fma(a0.s5, b5.s1, acc11);

2110

acc11 = fma(a0.s6, b6.s1, acc11);

2111

acc11 = fma(a0.s7, b7.s1, acc11);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2112

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2113

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2114

#if defined(REINTERPRET_INPUT_AS_3D)

2115

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2116

#else // defined(REINTERPRET_INPUT_AS_3D)

2117

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2118

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2119

acc20 = fma(a0.s0, b0.s0, acc20);

2120

acc20 = fma(a0.s1, b1.s0, acc20);

2121

acc20 = fma(a0.s2, b2.s0, acc20);

2122

acc20 = fma(a0.s3, b3.s0, acc20);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2123

acc20 = fma(a0.s4, b4.s0, acc20);

2124

acc20 = fma(a0.s5, b5.s0, acc20);

2125

acc20 = fma(a0.s6, b6.s0, acc20);

2126

acc20 = fma(a0.s7, b7.s0, acc20);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2127

2128

acc21 = fma(a0.s0, b0.s1, acc21);

2129

acc21 = fma(a0.s1, b1.s1, acc21);

2130

acc21 = fma(a0.s2, b2.s1, acc21);

2131

acc21 = fma(a0.s3, b3.s1, acc21);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2132

acc21 = fma(a0.s4, b4.s1, acc21);

2133

acc21 = fma(a0.s5, b5.s1, acc21);

2134

acc21 = fma(a0.s6, b6.s1, acc21);

2135

acc21 = fma(a0.s7, b7.s1, acc21);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2136

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2137

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2138

#if defined(REINTERPRET_INPUT_AS_3D)

2139

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2140

#else // defined(REINTERPRET_INPUT_AS_3D)

2141

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2142

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2143

acc30 = fma(a0.s0, b0.s0, acc30);

2144

acc30 = fma(a0.s1, b1.s0, acc30);

2145

acc30 = fma(a0.s2, b2.s0, acc30);

2146

acc30 = fma(a0.s3, b3.s0, acc30);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2147

acc30 = fma(a0.s4, b4.s0, acc30);

2148

acc30 = fma(a0.s5, b5.s0, acc30);

2149

acc30 = fma(a0.s6, b6.s0, acc30);

2150

acc30 = fma(a0.s7, b7.s0, acc30);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2151

2152

acc31 = fma(a0.s0, b0.s1, acc31);

2153

acc31 = fma(a0.s1, b1.s1, acc31);

2154

acc31 = fma(a0.s2, b2.s1, acc31);

2155

acc31 = fma(a0.s3, b3.s1, acc31);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2156

acc31 = fma(a0.s4, b4.s1, acc31);

2157

acc31 = fma(a0.s5, b5.s1, acc31);

2158

acc31 = fma(a0.s6, b6.s1, acc31);

2159

acc31 = fma(a0.s7, b7.s1, acc31);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2160

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2161

2162

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2163

}

2164

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2165

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2166

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2167

#if defined(REINTERPRET_INPUT_AS_3D)

2168

// Load values from matrix A

2169

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

2170

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2171

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2172

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2173

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2174

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2175

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2176

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2177

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2178

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2179

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2180

// Load values from matrix A

2181

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2182

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2183

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2184

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2185

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2186

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2187

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2188

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2189

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2190

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2191

#endif // defined(REINTERPRET_INPUT_AS_3D)

2192

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2193

// Load values from matrix B

2194

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2195

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2196

2197

// Multiply and accumulate

2198

acc00 = fma(a0, b0.s0, acc00);

2199

acc01 = fma(a0, b0.s1, acc01);

2200

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2201

acc10 = fma(a1, b0.s0, acc10);

2202

acc11 = fma(a1, b0.s1, acc11);

2203

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2204

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2205

acc20 = fma(a2, b0.s0, acc20);

2206

acc21 = fma(a2, b0.s1, acc21);

2207

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2208

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2209

acc30 = fma(a3, b0.s0, acc30);

2210

acc31 = fma(a3, b0.s1, acc31);

2211

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2212

2213

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2214

}

2215

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2216

// Multiply by the weight of matrix-matrix product and store the result

2217

#if defined(ALPHA)

2218

acc00 = acc00 * ALPHA;

2219

acc01 = acc01 * ALPHA;

2220

#endif // defined(ALPHA)

2221

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2222

acc10 = acc10 * ALPHA;

2223

acc11 = acc11 * ALPHA;

2224

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2225

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2226

acc20 = acc20 * ALPHA;

2227

acc21 = acc21 * ALPHA;

2228

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2229

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2230

acc30 = acc30 * ALPHA;

2231

acc31 = acc31 * ALPHA;

2232

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2233

2234

int z = get_global_id(2);

2235

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2236

// Compute destination address

2237

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2238

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2239

// Compute dst address

2240

__global uchar *dst_addr = offset(&dst, 0, 0);

2241

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2242

#if defined(REINTERPRET_OUTPUT_AS_3D)

2243

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2244

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2245

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

2250

// |******************|

2251

// | cross_plane_pad |

2252

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2257

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2258

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

2259

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

2260

zout = min(DEPTH_GEMM3D - 1, zout);

2261

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2262

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2263

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2264

2265

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2266

// multiply dst_stride_z by DEPTH_GEMM3D

2267

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2268

2269

// Store the output block

2270

vstore2((float2)(acc00, acc01), 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2271

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2272

vstore2((float2)(acc10, acc11), 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2273

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2274

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2275

vstore2((float2)(acc20, acc21), 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2276

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2277

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2278

vstore2((float2)(acc30, acc31), 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2279

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2280

2281

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2282

// Add offset for batched GEMM

2283

dst_addr += z * dst_stride_z;

2284

2285

// Store the output block

2286

vstore2((float2)(acc00, acc01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

2287

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2288

vstore2((float2)(acc10, acc11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

2289

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2290

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2291

vstore2((float2)(acc20, acc21), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

2292

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2293

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2294

vstore2((float2)(acc30, acc31), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

2295

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2296

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2297

}

2298

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

2299

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2300

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

2301

*

Vidhya Sudhan Loganathan

a25d16c

2018-11-16 11:33:12 +0000

[diff] [blame^]

2302

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in 32-bit floating point variables.

2303

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

2304

* Each work-item loads and stores 8 half values along X (vload8/vstore8), so this kernel is meant to be built with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.

2305

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

2306

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

2307

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

2308

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

2309

*

2310

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2311

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2312

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2313

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2314

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2315

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

2316

*

2317

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

2318

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2319

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

2320

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2321

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

2322

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

2323

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

2324

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2325

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

2326

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2327

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

2328

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

2329

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

2330

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2331

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2332

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2333

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2334

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2335

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2336

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

2337

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2338

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

2339

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2340

*/

2341

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
                                                       IMAGE_DECLARATION(src1),
                                                       IMAGE_DECLARATION(dst),
                                                       uint src0_stride_z,
                                                       uint src1_stride_z,
                                                       uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                       ,
                                                       uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                       ,
                                                       uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                      )
{
    // Each work-item produces a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y (at most 4) rows by 8 columns of dst.
    // A and B are read as half, the products are accumulated in float and the tile is narrowed to half on store.

    // First column (in elements) of matrix B handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B (.s0 = byte offset in A, .s1 = byte offset in B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // 32-bit accumulators, one float8 per output row of the tile
    float8 acc0 = 0.0f;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    float8 acc1 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    float8 acc2 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    float8 acc3 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume 4 columns of A (and the 4 matching rows of B) per iteration
    int i = 0;
    for(; i <= ((int)COLS_A - 4); i += 4)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B (widened to float so that the products are accumulated in 32-bit)
        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
        src_addr.s1 += src1_stride_y;

        // Accumulate
        acc0 = fma(b0, (float8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (float8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (float8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (float8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (float8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (float8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (float8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (float8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (float8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (float8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (float8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (float8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (float8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (float8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (float8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (float8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance matrix A by the 4 half values just consumed
        src_addr.s0 += 4 * sizeof(half);
    }

    // Left-over loop: handle the remaining (COLS_A % 4) columns of A one at a time
    for(; i < (int)COLS_A; ++i)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

        // Advance A by one half value and B by one row
        src_addr += (int2)(sizeof(half), src1_stride_y);

        // Accumulate
        acc0 = fma(b0, (float8)a0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (float8)a1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (float8)a2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (float8)a3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Multiply by the weight of matrix-matrix product while the result is still in 32-bit precision.
    // Scaling after the conversion to half would round twice and would turn any sum outside the half
    // range into infinity before ALPHA had a chance to bring it back into range.
#if defined(ALPHA)
    acc0 = acc0 * (float8)ALPHA;
#endif // defined(ALPHA)
    half8 hacc0 = convert_half8(acc0);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if defined(ALPHA)
    acc1 = acc1 * (float8)ALPHA;
#endif // defined(ALPHA)
    half8 hacc1 = convert_half8(acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if defined(ALPHA)
    acc2 = acc2 * (float8)ALPHA;
#endif // defined(ALPHA)
    half8 hacc2 = convert_half8(acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if defined(ALPHA)
    acc3 = acc3 * (float8)ALPHA;
#endif // defined(ALPHA)
    half8 hacc3 = convert_half8(acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout       = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

    // Store the output block
    vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

#else // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

    // Store the output block
    vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // REINTERPRET_OUTPUT_AS_3D
}

2647

2648

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

2649

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2650

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

2651

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

2652

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

2653

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

2654

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

2655

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

2656

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

2657

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2658

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2659

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2660

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2661

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2662

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2663

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

2664

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2665

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

2666

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2667

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2668

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2669

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2670

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

2671

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

2672

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2673

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2674

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2675

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2676

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

2677

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

2678

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2679

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2680

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2681

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2682

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2683

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2684

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

2685

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2686

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

2687

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2688

*/

2689

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // First column of matrix B / of the output tile handled by this work-item
    int col = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Running byte offsets: .s0 walks matrix A (src0), .s1 walks matrix B (src1)
    int2 ab_off = (int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes);

    // Matrix A: move to the first of the rows processed by this work-item
    ab_off.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Matrix B: move to the first column processed by this work-item
    ab_off.s1 += col * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The 2D input tile is read from a 3D tensor, so each of the (up to) four rows may sit in a
    // different plane and has to skip the cross-plane padding of the planes that precede it:
    //
    //  |                  |
    //  |      plane0      |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |__________________|

    // Plane index of each row: row index (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + 0..3) / HEIGHT_GEMM3D,
    // clamped to the last plane
    uint4 pad_in = (uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y);
    pad_in /= (uint4)HEIGHT_GEMM3D;
    pad_in = min(DEPTH_GEMM3D - 1, pad_in);

    // Turn the plane index into the byte offset introduced by the cross-plane padding
    pad_in *= (src_cross_plane_pad * src0_stride_y);

    // Batched GEMM: batches live in the fourth dimension, hence src0_stride_z is scaled by DEPTH_GEMM3D
    ab_off.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    // Batched GEMM: select the batch of matrix A
    ab_off.s0 += get_global_id(2) * src0_stride_z;
#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Matrix B has 3 dimensions while matrix A has more: wrap the batch index instead of sliding past B
    ab_off.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    ab_off.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One half8 accumulator per output row
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume four columns of A (and four rows of B) per iteration
    int k = 0;
    for(; k <= ((int)COLS_A - 4); k += 4)
    {
        // Four consecutive values from each processed row of matrix A
#if defined(REINTERPRET_INPUT_AS_3D)
        half4 a0 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 0 * src0_stride_y + pad_in.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 1 * src0_stride_y + pad_in.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 2 * src0_stride_y + pad_in.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 3 * src0_stride_y + pad_in.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        half4 a0 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + ab_off.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Step 0: row of B matching element .s0 of the A values
        half8 b_row = vload8(0, (__global half *)(src1_ptr + ab_off.s1));
        ab_off.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 1: next row of B, element .s1 of the A values
        b_row = vload8(0, (__global half *)(src1_ptr + ab_off.s1));
        ab_off.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 2: next row of B, element .s2 of the A values
        b_row = vload8(0, (__global half *)(src1_ptr + ab_off.s1));
        ab_off.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 3: next row of B, element .s3 of the A values
        b_row = vload8(0, (__global half *)(src1_ptr + ab_off.s1));
        ab_off.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance matrix A by the four columns just consumed
        ab_off.s0 += 4 * sizeof(half);
    }

    // Left-over loop: remaining columns of A, one at a time
    for(; k < (int)COLS_A; ++k)
    {
        // One value from each processed row of matrix A
#if defined(REINTERPRET_INPUT_AS_3D)
        half a0 = *((__global half *)(src0_ptr + ab_off.s0 + 0 * src0_stride_y + pad_in.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + ab_off.s0 + 1 * src0_stride_y + pad_in.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + ab_off.s0 + 2 * src0_stride_y + pad_in.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + ab_off.s0 + 3 * src0_stride_y + pad_in.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        half a0 = *((__global half *)(src0_ptr + ab_off.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + ab_off.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + ab_off.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + ab_off.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Matching row of matrix B
        half8 b_row = vload8(0, (__global half *)(src1_ptr + ab_off.s1));

        // Next column of A, next row of B
        ab_off.s0 += sizeof(half);
        ab_off.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Scale the matrix-matrix product by alpha (only when -DALPHA is given)
#if defined(ALPHA)
    acc0 = acc0 * (half8)ALPHA;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    acc1 = acc1 * (half8)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    acc2 = acc2 * (half8)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    acc3 = acc3 * (half8)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(ALPHA)

    // Batch index
    int batch = get_global_id(2);

    // Destination address of the output tile
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    __global uchar *dst_addr = offset(&dst_img, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is written into a 3D tensor, so each of the (up to) four rows may land in a
    // different plane and has to skip the cross-plane padding of the planes that precede it:
    //
    //  |                  |
    //  |      plane0      |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |__________________|

    // Plane index of each row: row index (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + 0..3) / HEIGHT_GEMM3D,
    // clamped to the last plane
    uint4 pad_out = (uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y);
    pad_out /= (uint4)HEIGHT_GEMM3D;
    pad_out = min(DEPTH_GEMM3D - 1, pad_out);

    // Turn the plane index into the byte offset introduced by the cross-plane padding
    pad_out *= (dst_cross_plane_pad * dst_stride_y);

    // Batched GEMM: batches live in the fourth dimension, hence dst_stride_z is scaled by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

    // Write one half8 per output row
    vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + pad_out.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + pad_out.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + pad_out.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + pad_out.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: select the output batch
    dst_addr += batch * dst_stride_z;

    // Write one half8 per output row
    vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // REINTERPRET_OUTPUT_AS_3D
}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

2979

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2980

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2981

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2982

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2983

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2984

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

2985

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2986

* @note The value of beta needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2987

*

2988

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

2989

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

2990

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2991

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

2992

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2993

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2994

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2995

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2996

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2997

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2998

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2999

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3000

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3001

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3002

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3003

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3004

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3005

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the in-place A x B result (dst)
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global float *c_ptr  = (__global float *)c_tensor.ptr;
    __global float *ab_ptr = (__global float *)ab_tensor.ptr;

    // Four values of A x B (already stored in dst) and four values of matrix C
    float4 ab_vals = vload4(0, ab_ptr);
    float4 c_vals  = vload4(0, c_ptr);

    // dst = (A x B) + beta * C, written back in place
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_ptr);
}

3024

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

3025

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3026

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

3027

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

3028

* @note The value of beta needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3029

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3030

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

3031

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

3032

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3033

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

3034

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3035

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

3036

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3037

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3038

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3039

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3040

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3041

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3042

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3043

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3044

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3045

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3046

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3047

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the in-place A x B result (dst)
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global half *c_ptr  = (__global half *)c_tensor.ptr;
    __global half *ab_ptr = (__global half *)ab_tensor.ptr;

    // Eight values of A x B (already stored in dst) and eight values of matrix C
    half8 ab_vals = vload8(0, ab_ptr);
    half8 c_vals  = vload8(0, c_ptr);

    // dst = (A x B) + beta * C, written back in place
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_ptr);
}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

3066

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3067

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3068

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3069

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3070

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

3071

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

3072

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3073

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

3074

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3075

*

3076

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

3077

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3078

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3079

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3080

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3081

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3082

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3083

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3084

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3085

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3086

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3087

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

3088

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

3089

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3090

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3091

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3092

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3093

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3094

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3095

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3096

*/

3097

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // Each work-item produces four consecutive output values of one row
    int col = get_global_id(0) * 4;
    int row = get_global_id(1);

    // Byte offsets: .s0 walks the row of A selected by `row`, .s1 walks the slice of B selected by `row`
    int2 ab_off = (int2)(src0_offset_first_element_in_bytes + src0_stride_y * row, src1_offset_first_element_in_bytes + src1_stride_z * row);
    ab_off.s1 += col * sizeof(float);

    // Byte offset one past the last element of the current row of A
    int a_row_end = ab_off.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 acc = 0.0f;

    // Main loop: two elements of A (and two rows of B) per iteration
    while(ab_off.s0 <= (a_row_end - 2 * (int)sizeof(float)))
    {
        float2 a_pair   = vload2(0, (__global float *)(src0_ptr + ab_off.s0));
        float4 b_first  = vload4(0, (__global float *)(src1_ptr + ab_off.s1));
        float4 b_second = vload4(0, (__global float *)(src1_ptr + ab_off.s1 + src1_stride_y));

        acc += b_first * (float4)a_pair.s0;
        acc += b_second * (float4)a_pair.s1;

        ab_off += (int2)(2 * sizeof(float), 2 * src1_stride_y);
    }

    // Left-over loop: at most one remaining element of A
    while(ab_off.s0 < a_row_end)
    {
        float  a_val = *((__global float *)(src0_ptr + ab_off.s0));
        float4 b_val = vload4(0, (__global float *)(src1_ptr + ab_off.s1));

        acc += b_val * (float4)a_val;

        ab_off += (int2)(sizeof(float), src1_stride_y);
    }

    // Store the four accumulated values at this work-item's destination address
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(acc, 0, (__global float *)(offset(&dst_img, 0, 0)));
}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3135

#endif // defined(WIDTH_VECTOR_A)

3136

3137

/** This kernel accumulates each row with the biases vector.

3138

*

3139

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.

3140

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.

3141

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

3142

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3143

* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)

3144

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

3145

* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)

3146

* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)

3147

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

3148

* @param[in] biases_ptr Pointer to the biases vector. Supported data type: same as @p accum_ptr

3149

* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)

3150

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

3151

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector

3152

*/

3153

#if defined(DATA_TYPE) && defined(VECTOR_SIZE)

3154

__kernel void gemm_accumulate_biases(

3155

IMAGE_DECLARATION(accum),

3156

VECTOR_DECLARATION(biases))

3157

{

3158

Image accum = CONVERT_TO_IMAGE_STRUCT(accum);

3159

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

3160

3161

// Vector size, i.e. number of vector elements.

3162

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

3163

accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);

3164

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

3165

biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

3166

accum_value = biases_value + accum_value;

Anton Lokhmotov