Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-01-12 10:21:40 +0000

[diff] [blame]

42

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

43

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

44

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

45

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

46

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

47

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

48

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

49

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

50

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

51

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

52

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

53

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

54

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

55

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

56

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

57

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

58

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

59

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

60

__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
                                TENSOR3D_DECLARATION(dst))
{
    // Work-item coordinates: column chunk, row and batch of the source matrix
    const uint col   = get_global_id(0);
    const uint row   = get_global_id(1);
    const uint batch = get_global_id(2);

    // Compute address for Matrix B - source
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Each work-item moves exactly one vector of TRANSPOSE_W elements
    const VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W) chunk = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);

    // Size in bytes of one TRANSPOSE_W-wide vector
    const uint vec_bytes = TRANSPOSE_W * sizeof(DATA_TYPE);

    // Compute address for Matrix B transposed - destination. X and Y are swapped:
    //  - the source row selects a slot of MULT_TRANSPOSE1XW_WIDTH vectors along the destination row
    //  - the source column chunk selects the destination row (col / MULT_TRANSPOSE1XW_WIDTH)
    //    and the vector inside the slot (col % MULT_TRANSPOSE1XW_WIDTH)
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += row * vec_bytes * MULT_TRANSPOSE1XW_WIDTH;
    dst_addr_in_bytes += (col / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y;
    dst_addr_in_bytes += (col % MULT_TRANSPOSE1XW_WIDTH) * vec_bytes;

    // Add offset for batched GEMM
    dst_addr_in_bytes += batch * dst_stride_z;

    VSTORE(TRANSPOSE_W)
    (chunk, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

83

#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

84

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

85

#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

86

87

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

88

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

89

* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)

90

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

91

* @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:

92

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

93

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

94

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

95

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

96

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

97

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

98

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

99

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

100

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

101

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

102

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

103

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

104

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

105

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

106

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

107

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

108

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

109

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

110

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

111

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

112

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

113

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

114

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

115

__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

116

TENSOR3D_DECLARATION(dst)

117

#if defined(REINTERPRET_INPUT_AS_3D)

118

,

119

uint cross_plane_pad

120

#endif // REINTERPRET_INPUT_AS_3D

121

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

122

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

123

// Compute source and destination addresses

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

124

uint x = get_global_id(0);

125

uint y = get_global_id(1);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

126

uint z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

127

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

128

// Compute address for source tensor

129

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

130

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

131

// Compute address for Matrix B transposed - destination. X and Y are swapped

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

132

uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +

133

(y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

134

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

135

// Add offset for batched GEMM

136

dst_addr_in_bytes += z * dst_stride_z;

137

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

138

#if defined(REINTERPRET_INPUT_AS_3D)

139

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * 4 * sizeof(DATA_TYPE) + y * 4 * src_stride_y;

140

141

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

142

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

148

// |******************|

149

// | cross_plane_pad |

150

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

155

156

// The plane (zin) is calculated dividing M (y * 4) by HEIGHT_GEMM3D

157

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(y * 4)) / (uint4)HEIGHT_GEMM3D;

158

zin = min(DEPTH_GEMM3D - 1, zin);

159

160

// Add offset due to the cross plane paddings

161

zin *= (cross_plane_pad * src_stride_y);

162

163

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

164

// multiply src_stride_z by DEPTH_GEMM3D

165

input_ptr += z * src_stride_z * DEPTH_GEMM3D;

166

167

// Load values from Matrix A

168

VEC_DATA_TYPE(DATA_TYPE, 4)

169

a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin.s0));

170

VEC_DATA_TYPE(DATA_TYPE, 4)

171

a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin.s1));

172

VEC_DATA_TYPE(DATA_TYPE, 4)

173

a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin.s2));

174

VEC_DATA_TYPE(DATA_TYPE, 4)

175

a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin.s3));

176

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

177

__global uchar *input_ptr = src.ptr;

178

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

179

// Load values from Matrix A

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

180

VEC_DATA_TYPE(DATA_TYPE, 4)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

181

a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

182

VEC_DATA_TYPE(DATA_TYPE, 4)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

183

a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

184

VEC_DATA_TYPE(DATA_TYPE, 4)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

185

a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

186

VEC_DATA_TYPE(DATA_TYPE, 4)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

187

a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

188

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

189

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

190

VEC_DATA_TYPE(DATA_TYPE, 4)

191

val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);

192

vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

193

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

194

val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);

195

vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

196

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

197

val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);

198

vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

199

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

200

val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);

201

vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

202

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

203

#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

204

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

205

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

206

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

207

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

208

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

209

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

210

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

211

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

212

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

213

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

214

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

215

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

216

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

217

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

218

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

219

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

220

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

221

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

222

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

223

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

224

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

225

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

226

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

227

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

228

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

229

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

230

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

231

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

232

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

233

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

234

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

235

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

236

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

237

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

238

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

239

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

240

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

241

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

242

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

243

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

244

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

245

IMAGE_DECLARATION(src1),

246

IMAGE_DECLARATION(dst),

247

uint src0_stride_z,

248

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

249

uint dst_stride_z

250

#if defined(REINTERPRET_OUTPUT_AS_3D)

251

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

252

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

253

#endif // REINTERPRET_OUTPUT_AS_3D

254

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

255

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

256

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

257

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

258

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

259

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

260

// Offset

261

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

262

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

263

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

264

// src_addr_a = address of matrix A

265

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

266

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

267

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

268

269

#if defined(MATRIX_B_DEPTH)

270

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

271

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

272

#else // defined(MATRIX_B_DEPTH)

273

src1_addr_in_bytes += z * src1_stride_z;

274

#endif // defined(MATRIX_B_DEPTH)

275

276

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

277

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

278

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

279

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

280

__global float *src_end_addr_b = src_addr_b + COLS_B;

281

282

src_addr_a += offset_row_a;

283

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

284

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

285

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

float4 c00 = 0.0f;

float4 c10 = 0.0f;

float4 c20 = 0.0f;

float4 c30 = 0.0f;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

291

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

292

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

293

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

294

float4 a0 = vload4(0, src_addr_a);

295

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

296

297

c00 += (float4)a0.s0 * b0;

298

c10 += (float4)a0.s1 * b0;

299

c20 += (float4)a0.s2 * b0;

300

c30 += (float4)a0.s3 * b0;

301

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

302

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

303

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

304

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

305

306

c00 += (float4)a0.s0 * b0;

307

c10 += (float4)a0.s1 * b0;

308

c20 += (float4)a0.s2 * b0;

309

c30 += (float4)a0.s3 * b0;

310

}

311

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

312

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

313

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

314

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

315

float4 a0 = vload4(0, src_addr_a);

316

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

317

318

c00 += (float4)a0.s0 * b0;

319

c10 += (float4)a0.s1 * b0;

320

c20 += (float4)a0.s2 * b0;

321

c30 += (float4)a0.s3 * b0;

322

}

323

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

324

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

325

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

326

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

327

#if defined(ALPHA)

328

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

329

c00 = c00 * (float4)ALPHA;

330

c10 = c10 * (float4)ALPHA;

331

c20 = c20 * (float4)ALPHA;

332

c30 = c30 * (float4)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

333

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

334

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

335

// Compute dst address

336

__global uchar *dst_addr = offset(&dst, 0, 0);

337

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

338

#if defined(REINTERPRET_OUTPUT_AS_3D)

339

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

340

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

341

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

346

// |******************|

347

// | cross_plane_pad |

348

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

353

354

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

355

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

356

zout = min(DEPTH_GEMM3D - 1, zout);

357

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

358

// Add offset due to the cross plane paddings

359

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

360

361

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

362

// multiply dst_stride_z by DEPTH_GEMM3D

363

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

364

365

// Store 4x4 block

366

vstore4(c00, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

367

vstore4(c10, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

368

vstore4(c20, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

369

vstore4(c30, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

370

371

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

372

// Add offset for batched GEMM

373

dst_addr += z * dst_stride_z;

374

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

375

// Store 4x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

376

vstore4(c00, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

377

vstore4(c10, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

378

vstore4(c20, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

379

vstore4(c30, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

380

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

381

}

382

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

383

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

384

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

385

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

386

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

387

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

388

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

389

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

390

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

391

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

392

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

393

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

394

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

395

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

396

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

397

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

398

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

399

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

400

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

401

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

402

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

403

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

404

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

405

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

406

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

407

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

408

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

409

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

410

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

411

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

412

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

413

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

414

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

415

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

416

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

417

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

418

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

419

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

420

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

421

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

422

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

423

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

424

IMAGE_DECLARATION(dst),

425

uint src0_stride_z,

426

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

427

uint dst_stride_z

428

#if defined(REINTERPRET_OUTPUT_AS_3D)

429

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

430

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

431

#endif // REINTERPRET_OUTPUT_AS_3D

432

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

433

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

434

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

435

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

436

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

437

438

// Offset

439

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

440

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

441

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

442

// src_addr_a = address of matrix A

443

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

444

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

445

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

446

447

#if defined(MATRIX_B_DEPTH)

448

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

449

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

450

#else // defined(MATRIX_B_DEPTH)

451

src1_addr_in_bytes += z * src1_stride_z;

452

#endif // defined(MATRIX_B_DEPTH)

453

454

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

455

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

456

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

457

src_addr_a += offset_row_a;

458

src_addr_b += offset_row_b;

459

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

460

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

478

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

479

480

int i = 0;

481

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

482

{

483

// Load values from matrix A (interleaved) and matrix B (transposed)

484

float4 a0 = vload4(0, src_addr_a);

485

float4 b0 = vload4(0, src_addr_b);

486

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

487

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

488

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

489

490

c00 = fma(a0.s0, b0.s0, c00);

491

c01 = fma(a0.s0, b0.s1, c01);

492

c02 = fma(a0.s0, b0.s2, c02);

493

c03 = fma(a0.s0, b0.s3, c03);

494

495

c10 = fma(a0.s1, b0.s0, c10);

496

c11 = fma(a0.s1, b0.s1, c11);

497

c12 = fma(a0.s1, b0.s2, c12);

498

c13 = fma(a0.s1, b0.s3, c13);

499

500

c20 = fma(a0.s2, b0.s0, c20);

501

c21 = fma(a0.s2, b0.s1, c21);

502

c22 = fma(a0.s2, b0.s2, c22);

503

c23 = fma(a0.s2, b0.s3, c23);

504

505

c30 = fma(a0.s3, b0.s0, c30);

506

c31 = fma(a0.s3, b0.s1, c31);

507

c32 = fma(a0.s3, b0.s2, c32);

508

c33 = fma(a0.s3, b0.s3, c33);

509

510

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

511

a0 = vload4(0, src_addr_a);

512

b0 = vload4(0, src_addr_b);

513

514

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

515

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

516

517

c00 = fma(a0.s0, b0.s0, c00);

518

c01 = fma(a0.s0, b0.s1, c01);

519

c02 = fma(a0.s0, b0.s2, c02);

520

c03 = fma(a0.s0, b0.s3, c03);

521

522

c10 = fma(a0.s1, b0.s0, c10);

523

c11 = fma(a0.s1, b0.s1, c11);

524

c12 = fma(a0.s1, b0.s2, c12);

525

c13 = fma(a0.s1, b0.s3, c13);

526

527

c20 = fma(a0.s2, b0.s0, c20);

528

c21 = fma(a0.s2, b0.s1, c21);

529

c22 = fma(a0.s2, b0.s2, c22);

530

c23 = fma(a0.s2, b0.s3, c23);

531

532

c30 = fma(a0.s3, b0.s0, c30);

533

c31 = fma(a0.s3, b0.s1, c31);

534

c32 = fma(a0.s3, b0.s2, c32);

535

c33 = fma(a0.s3, b0.s3, c33);

536

537

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

538

a0 = vload4(0, src_addr_a);

539

b0 = vload4(0, src_addr_b);

540

541

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

542

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

543

544

c00 = fma(a0.s0, b0.s0, c00);

545

c01 = fma(a0.s0, b0.s1, c01);

546

c02 = fma(a0.s0, b0.s2, c02);

547

c03 = fma(a0.s0, b0.s3, c03);

548

549

c10 = fma(a0.s1, b0.s0, c10);

550

c11 = fma(a0.s1, b0.s1, c11);

551

c12 = fma(a0.s1, b0.s2, c12);

552

c13 = fma(a0.s1, b0.s3, c13);

553

554

c20 = fma(a0.s2, b0.s0, c20);

555

c21 = fma(a0.s2, b0.s1, c21);

556

c22 = fma(a0.s2, b0.s2, c22);

557

c23 = fma(a0.s2, b0.s3, c23);

558

559

c30 = fma(a0.s3, b0.s0, c30);

560

c31 = fma(a0.s3, b0.s1, c31);

561

c32 = fma(a0.s3, b0.s2, c32);

562

c33 = fma(a0.s3, b0.s3, c33);

563

564

// Load values from matrix A (interleaved) and matrix B (transposed)

565

a0 = vload4(0, src_addr_a);

566

b0 = vload4(0, src_addr_b);

567

568

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

569

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

570

571

c00 = fma(a0.s0, b0.s0, c00);

572

c01 = fma(a0.s0, b0.s1, c01);

573

c02 = fma(a0.s0, b0.s2, c02);

574

c03 = fma(a0.s0, b0.s3, c03);

575

576

c10 = fma(a0.s1, b0.s0, c10);

577

c11 = fma(a0.s1, b0.s1, c11);

578

c12 = fma(a0.s1, b0.s2, c12);

579

c13 = fma(a0.s1, b0.s3, c13);

580

581

c20 = fma(a0.s2, b0.s0, c20);

582

c21 = fma(a0.s2, b0.s1, c21);

583

c22 = fma(a0.s2, b0.s2, c22);

584

c23 = fma(a0.s2, b0.s3, c23);

585

586

c30 = fma(a0.s3, b0.s0, c30);

587

c31 = fma(a0.s3, b0.s1, c31);

588

c32 = fma(a0.s3, b0.s2, c32);

589

c33 = fma(a0.s3, b0.s3, c33);

590

}

591

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

592

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

593

{

594

// Load values from matrix A (interleaved) and matrix B (transposed)

595

float4 a0 = vload4(0, src_addr_a);

596

float4 b0 = vload4(0, src_addr_b);

597

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

598

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

599

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

600

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

601

c00 = fma(a0.s0, b0.s0, c00);

602

c01 = fma(a0.s0, b0.s1, c01);

603

c02 = fma(a0.s0, b0.s2, c02);

604

c03 = fma(a0.s0, b0.s3, c03);

605

606

c10 = fma(a0.s1, b0.s0, c10);

607

c11 = fma(a0.s1, b0.s1, c11);

608

c12 = fma(a0.s1, b0.s2, c12);

609

c13 = fma(a0.s1, b0.s3, c13);

610

611

c20 = fma(a0.s2, b0.s0, c20);

612

c21 = fma(a0.s2, b0.s1, c21);

613

c22 = fma(a0.s2, b0.s2, c22);

614

c23 = fma(a0.s2, b0.s3, c23);

615

616

c30 = fma(a0.s3, b0.s0, c30);

617

c31 = fma(a0.s3, b0.s1, c31);

618

c32 = fma(a0.s3, b0.s2, c32);

619

c33 = fma(a0.s3, b0.s3, c33);

620

}

621

622

// Compute destination address

623

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

624

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

625

#if defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

626

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

643

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

644

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

645

// Compute dst address

646

__global uchar *dst_addr = offset(&dst, 0, 0);

647

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

648

#if defined(REINTERPRET_OUTPUT_AS_3D)

649

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

650

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

651

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

656

// |******************|

657

// | cross_plane_pad |

658

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

663

664

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

665

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

666

zout = min(DEPTH_GEMM3D - 1, zout);

667

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

668

// Add offset due to the cross plane paddings

669

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

670

671

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

672

// multiply dst_stride_z by DEPTH_GEMM3D

673

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

674

675

// Store 4x4 block

676

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

677

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

678

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

679

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

680

681

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

682

// Add offset for batched GEMM

683

dst_addr += z * dst_stride_z;

684

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

685

// Store 4x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

686

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

687

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

688

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

689

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

690

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

691

}

692

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

693

// Undefine local defines

694

#undef COLS_MTX_B

695

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

696

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

697

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

698

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

699

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

700

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

701

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

702

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

703

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

704

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

705

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

706

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

707

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

708

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

709

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

710

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

711

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

712

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

713

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

714

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

715

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

716

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

717

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

718

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

719

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

720

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

721

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

722

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

723

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

724

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

725

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

726

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

727

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

728

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

729

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

730

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

731

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

732

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

733

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

734

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

735

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

736

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

737

IMAGE_DECLARATION(dst),

738

uint src0_stride_z,

739

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

740

uint dst_stride_z

741

#if defined(REINTERPRET_OUTPUT_AS_3D)

742

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

743

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

744

#endif // REINTERPRET_OUTPUT_AS_3D

745

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

746

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

747

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

748

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

749

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

750

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

751

// Offset

752

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

753

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

754

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

755

// src_addr_a = address of matrix A

756

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

757

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

758

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

759

760

#if defined(MATRIX_B_DEPTH)

761

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

762

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

763

#else // defined(MATRIX_B_DEPTH)

764

src1_addr_in_bytes += z * src1_stride_z;

765

#endif // defined(MATRIX_B_DEPTH)

766

767

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

768

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

769

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

770

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

771

__global half *src_end_addr_b = src_addr_b + COLS_B;

772

773

src_addr_a += offset_row_a;

774

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

775

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

776

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

782

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

783

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

784

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

785

half4 a0 = vload4(0, src_addr_a);

786

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

787

788

c00 += (half8)a0.s0 * b0;

789

c10 += (half8)a0.s1 * b0;

790

c20 += (half8)a0.s2 * b0;

791

c30 += (half8)a0.s3 * b0;

792

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

793

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

794

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

795

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

796

797

c00 += (half8)a0.s0 * b0;

798

c10 += (half8)a0.s1 * b0;

799

c20 += (half8)a0.s2 * b0;

800

c30 += (half8)a0.s3 * b0;

801

}

802

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

803

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

804

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

805

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

806

half4 a0 = vload4(0, src_addr_a);

807

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

808

809

c00 += (half8)a0.s0 * b0;

810

c10 += (half8)a0.s1 * b0;

811

c20 += (half8)a0.s2 * b0;

812

c30 += (half8)a0.s3 * b0;

813

}

814

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

815

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

816

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

817

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

818

#if defined(ALPHA)

819

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

820

c00 = c00 * (half8)ALPHA;

821

c10 = c10 * (half8)ALPHA;

822

c20 = c20 * (half8)ALPHA;

823

c30 = c30 * (half8)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

824

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

825

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

826

// Compute dst address

827

__global uchar *dst_addr = offset(&dst, 0, 0);

828

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

829

#if defined(REINTERPRET_OUTPUT_AS_3D)

830

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

831

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

832

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

837

// |******************|

838

// | cross_plane_pad |

839

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

844

845

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

846

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

847

zout = min(DEPTH_GEMM3D - 1, zout);

848

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

849

// Add offset due to the cross plane paddings

850

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

851

852

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

853

// multiply dst_stride_z by DEPTH_GEMM3D

854

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

855

856

// Store 4x8 block

857

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

858

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

859

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

860

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

861

862

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

863

// Add offset for batched GEMM

864

dst_addr += z * dst_stride_z;

865

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

866

// Store 4x8 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

867

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

868

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

869

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

870

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

871

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

872

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

873

874

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)

875

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

876

*

877

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

878

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

879

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

880

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

881

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

882

*

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

883

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

884

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

885

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

886

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

887

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

888

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

889

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

890

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

891

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

892

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

893

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

894

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

895

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

896

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

897

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

898

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

899

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

900

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

901

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

902

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

903

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

904

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

905

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

906

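* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)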

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

907

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

908

*/

909

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

910

IMAGE_DECLARATION(src1),

911

IMAGE_DECLARATION(dst),

912

uint src0_stride_z,

913

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

914

uint dst_stride_z

915

#if defined(REINTERPRET_OUTPUT_AS_3D)

916

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

917

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

918

#endif // REINTERPRET_OUTPUT_AS_3D

919

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

920

{

921

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

922

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

923

int z = get_global_id(2);

924

925

// Offset

926

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

927

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

928

929

// src_addr_a = address of matrix A

930

// src_addr_b = address of matrix B

931

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

932

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

933

934

#if defined(MATRIX_B_DEPTH)

935

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

936

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

937

#else // defined(MATRIX_B_DEPTH)

938

src1_addr_in_bytes += z * src1_stride_z;

939

#endif // defined(MATRIX_B_DEPTH)

940

941

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

942

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

943

944

// Compute end row address for matrix B

945

__global half *src_end_addr_b = src_addr_b + COLS_B;

946

947

src_addr_a += offset_row_a;

948

src_addr_b += offset_row_b;

949

950

// Reset accumulators

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

957

958

int i = 0;

959

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

960

{

961

#if MULT_INTERLEAVE4X4_HEIGHT == 1

962

// Load values from matrix A (interleaved) and matrix B (transposed)

963

half8 a0 = vload8(0, src_addr_a);

964

half8 b0 = vload8(0, src_addr_b);

965

966

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

967

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

968

969

c00 = fma((half8)a0.s0, b0, c00);

970

c10 = fma((half8)a0.s1, b0, c10);

971

c20 = fma((half8)a0.s2, b0, c20);

972

c30 = fma((half8)a0.s3, b0, c30);

973

974

// Load values from matrix B (transposed)

975

b0 = vload8(0, src_addr_b);

976

977

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

978

979

c00 = fma((half8)a0.s4, b0, c00);

980

c10 = fma((half8)a0.s5, b0, c10);

981

c20 = fma((half8)a0.s6, b0, c20);

982

c30 = fma((half8)a0.s7, b0, c30);

983

984

// Load values from matrix A (interleaved) and matrix B (transposed)

985

a0 = vload8(0, src_addr_a);

986

b0 = vload8(0, src_addr_b);

987

988

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

989

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

990

991

c00 = fma((half8)a0.s0, b0, c00);

992

c10 = fma((half8)a0.s1, b0, c10);

993

c20 = fma((half8)a0.s2, b0, c20);

994

c30 = fma((half8)a0.s3, b0, c30);

995

996

// Load values from matrix B (transposed)

997

b0 = vload8(0, src_addr_b);

998

999

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1000

1001

c00 = fma((half8)a0.s4, b0, c00);

1002

c10 = fma((half8)a0.s5, b0, c10);

1003

c20 = fma((half8)a0.s6, b0, c20);

1004

c30 = fma((half8)a0.s7, b0, c30);

1005

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

1006

// Load values from matrix A (interleaved) and matrix B (transposed)

1007

half4 a0 = vload4(0, src_addr_a);

1008

half8 b0 = vload8(0, src_addr_b);

1009

1010

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1011

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1012

1013

c00 = fma((half8)a0.s0, b0, c00);

1014

c10 = fma((half8)a0.s1, b0, c10);

1015

c20 = fma((half8)a0.s2, b0, c20);

1016

c30 = fma((half8)a0.s3, b0, c30);

1017

1018

// Load values from matrix A (interleaved) and matrix B (transposed)

1019

a0 = vload4(0, src_addr_a);

1020

b0 = vload8(0, src_addr_b);

1021

1022

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1023

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1024

1025

c00 = fma((half8)a0.s0, b0, c00);

1026

c10 = fma((half8)a0.s1, b0, c10);

1027

c20 = fma((half8)a0.s2, b0, c20);

1028

c30 = fma((half8)a0.s3, b0, c30);

1029

1030

// Load values from matrix A (interleaved) and matrix B (transposed)

1031

a0 = vload4(0, src_addr_a);

1032

b0 = vload8(0, src_addr_b);

1033

1034

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1035

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1036

1037

c00 = fma((half8)a0.s0, b0, c00);

1038

c10 = fma((half8)a0.s1, b0, c10);

1039

c20 = fma((half8)a0.s2, b0, c20);

1040

c30 = fma((half8)a0.s3, b0, c30);

1041

1042

// Load values from matrix A (interleaved) and matrix B (transposed)

1043

a0 = vload4(0, src_addr_a);

1044

b0 = vload8(0, src_addr_b);

1045

1046

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1047

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1048

1049

c00 = fma((half8)a0.s0, b0, c00);

1050

c10 = fma((half8)a0.s1, b0, c10);

1051

c20 = fma((half8)a0.s2, b0, c20);

1052

c30 = fma((half8)a0.s3, b0, c30);

1053

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

1054

}

1055

1056

for(; i < (int)(COLS_MTX_B); ++i)

1057

{

1058

// Load values from matrix A (interleaved) and matrix B (transposed)

1059

half4 a0 = vload4(0, src_addr_a);

1060

half8 b0 = vload8(0, src_addr_b);

1061

1062

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

1063

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

1064

1065

c00 = fma((half8)a0.s0, b0, c00);

1066

c10 = fma((half8)a0.s1, b0, c10);

1067

c20 = fma((half8)a0.s2, b0, c20);

1068

c30 = fma((half8)a0.s3, b0, c30);

1069

}

1070

1071

// Compute destination address

1072

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1073

1074

#if defined(ALPHA)

1075

// Multiply by the weight of matrix product

1076

c00 = c00 * (half8)ALPHA;

1077

c10 = c10 * (half8)ALPHA;

1078

c20 = c20 * (half8)ALPHA;

1079

c30 = c30 * (half8)ALPHA;

1080

#endif // defined(ALPHA)

1081

1082

// Compute dst address

1083

__global uchar *dst_addr = offset(&dst, 0, 0);

1084

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1085

#if defined(REINTERPRET_OUTPUT_AS_3D)

1086

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1087

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1088

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

1093

// |******************|

1094

// | cross_plane_pad |

1095

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1100

1101

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

1102

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

1103

zout = min(DEPTH_GEMM3D - 1, zout);

1104

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1105

// Add offset due to the cross plane paddings

1106

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1107

1108

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1109

// multiply dst_stride_z by DEPTH_GEMM3D

1110

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1111

1112

// Store 4x8 block

1113

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

1114

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

1115

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

1116

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

1117

1118

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1119

// Add offset for batched GEMM

1120

dst_addr += z * dst_stride_z;

1121

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

1122

// Store 4x8 block

1123

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

1124

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

1125

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

1126

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1127

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

1128

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

1129

1130

// Undefine local defines

1131

#undef COLS_MTX_B

1132

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

1133

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1134

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1135

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1136

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1137

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

1138

#if defined(DATA_TYPE)

1139

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1140

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1141

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1142

* @note This OpenCL kernel works with floating point data types (F16/F32)

1143

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1144

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1145

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1146

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1147

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1148

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1149

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1150

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1151

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1152

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1153

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1154

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1155

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1156

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1157

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1158

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1159

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1160

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1161

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1162

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1163

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1164

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1165

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1166

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1167

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1168

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1169

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1170

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1171

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1172

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1173

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1174

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1175

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1176

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1177

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1178

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1179

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1180

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

1181

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1182

IMAGE_DECLARATION(dst),

1183

uint src0_stride_z,

1184

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1185

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1186

#if defined(REINTERPRET_INPUT_AS_3D)

1187

,

1188

uint src_cross_plane_pad

1189

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1190

#if defined(REINTERPRET_OUTPUT_AS_3D)

1191

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1192

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1193

#endif // REINTERPRET_OUTPUT_AS_3D

1194

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1195

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1196

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1197

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1198

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1199

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1200

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1201

// Update address for the matrix A

1202

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1203

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1204

// Update address for the matrix B

1205

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1206

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1207

#if defined(REINTERPRET_INPUT_AS_3D)

1208

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1209

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1215

// |******************|

1216

// | cross_plane_pad |

1217

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1222

1223

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1224

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1225

zin = min(DEPTH_GEMM3D - 1, zin);

1226

1227

// Add offset due to the cross plane paddings

1228

zin *= (src_cross_plane_pad * src0_stride_y);

1229

1230

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1231

// multiply src0_stride_z by DEPTH_GEMM3D

1232

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1233

1234

#else // defined(REINTERPRET_INPUT_AS_3D)

1235

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1236

// Add offset for batched GEMM

1237

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1238

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1239

#endif // defined(REINTERPRET_INPUT_AS_3D)

1240

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1241

#if defined(MATRIX_B_DEPTH)

1242

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1243

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1244

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1245

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1246

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1247

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1248

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

1249

1250

VECTOR_TYPE acc0 = 0.0f;

1251

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1252

VECTOR_TYPE acc1 = 0.0f;

1253

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1254

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1255

VECTOR_TYPE acc2 = 0.0f;

1256

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1257

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1258

VECTOR_TYPE acc3 = 0.0f;

1259

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1260

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1261

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1262

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1263

#if defined(REINTERPRET_INPUT_AS_3D)

1264

// Load values from matrix A

1265

VEC_DATA_TYPE(DATA_TYPE, 2)

1266

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1267

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1268

VEC_DATA_TYPE(DATA_TYPE, 2)

1269

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1270

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1271

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1272

VEC_DATA_TYPE(DATA_TYPE, 2)

1273

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1274

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1275

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1276

VEC_DATA_TYPE(DATA_TYPE, 2)

1277

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1278

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1279

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1280

// Load values from matrix A

1281

VEC_DATA_TYPE(DATA_TYPE, 2)

1282

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1283

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1284

VEC_DATA_TYPE(DATA_TYPE, 2)

1285

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1286

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1287

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1288

VEC_DATA_TYPE(DATA_TYPE, 2)

1289

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1290

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1291

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1292

VEC_DATA_TYPE(DATA_TYPE, 2)

1293

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1294

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1295

#endif // defined(REINTERPRET_INPUT_AS_3D)

1296

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1297

// Load values from matrix B

1298

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

1299

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1300

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1301

// Accumulate

1302

acc0 += b0 * (VECTOR_TYPE)a0.s0;

1303

acc0 += b1 * (VECTOR_TYPE)a0.s1;

1304

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1305

acc1 += b0 * (VECTOR_TYPE)a1.s0;

1306

acc1 += b1 * (VECTOR_TYPE)a1.s1;

1307

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1308

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1309

acc2 += b0 * (VECTOR_TYPE)a2.s0;

1310

acc2 += b1 * (VECTOR_TYPE)a2.s1;

1311

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1312

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1313

acc3 += b0 * (VECTOR_TYPE)a3.s0;

1314

acc3 += b1 * (VECTOR_TYPE)a3.s1;

1315

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1316

}

1317

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1318

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1319

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1320

#if defined(REINTERPRET_INPUT_AS_3D)

1321

// Load values from matrix A

1322

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

1323

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1324

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

1325

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1326

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1327

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

1328

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1329

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1330

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

1331

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1332

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1333

// Load values from matrix A

1334

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1335

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1336

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1337

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1338

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1339

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1340

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1341

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1342

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1343

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1344

#endif // defined(REINTERPRET_INPUT_AS_3D)

1345

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1346

// Load values from matrix B

1347

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1348

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1349

// Accumulate

1350

acc0 += b0 * (VECTOR_TYPE)a0;

1351

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1352

acc1 += b0 * (VECTOR_TYPE)a1;

1353

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1354

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1355

acc2 += b0 * (VECTOR_TYPE)a2;

1356

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1357

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1358

acc3 += b0 * (VECTOR_TYPE)a3;

1359

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1360

}

1361

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1362

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1363

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1364

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1365

// Compute dst address

1366

__global uchar *dst_addr = offset(&dst, 0, 0);

1367

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1368

// Multiply by the weight of matrix-matrix product and store the result

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1369

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1370

acc0 = acc0 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1371

#endif // defined(ALPHA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1372

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

1373

acc1 = acc1 * (VECTOR_TYPE)ALPHA;

1374

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

1375

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

1376

acc2 = acc2 * (VECTOR_TYPE)ALPHA;

1377

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

1378

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

1379

acc3 = acc3 * (VECTOR_TYPE)ALPHA;

1380

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

1381

1382

int z = get_global_id(2);

1383

1384

#if defined(REINTERPRET_OUTPUT_AS_3D)

1385

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1386

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1387

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

1392

// |******************|

1393

// | cross_plane_pad |

1394

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1399

1400

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1401

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1402

zout = min(DEPTH_GEMM3D - 1, zout);

1403

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

1404

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1405

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1406

1407

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1408

// multiply dst_stride_z by DEPTH_GEMM3D

1409

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1410

1411

// Store output block

1412

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1413

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout.s0));

1414

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1415

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1416

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout.s1));

1417

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1418

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1419

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1420

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout.s2));

1421

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1422

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1423

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1424

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout.s3));

1425

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1426

1427

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1428

// Add offset for batched GEMM

1429

dst_addr += z * dst_stride_z;

1430

1431

// Store output block

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1432

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1433

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1434

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1435

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1436

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1437

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1438

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1439

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1440

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1441

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1442

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1443

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1444

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1445

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1446

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1447

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1448

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1449

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1450

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1451

*

1452

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1453

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1454

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1455

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1456

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1457

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1458

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1459

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1460

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1461

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1462

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1463

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1464

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1465

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

1466

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1467

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1468

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1469

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

1470

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1471

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

1472

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1473

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1474

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1475

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

1476

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1477

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

1478

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1479

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1480

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1481

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

1482

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1483

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

1484

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1485

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1486

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1487

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1488

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1489

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1490

*/

1491

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Each work-item accumulates a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows (at most 4 are handled here)
    // by 4 columns of the destination in scalar accumulators accRC (R = row, C = column) and writes every row
    // with a single vstore4.
    // NOTE(review): the tile is always 4 floats wide, so NUM_ELEMS_PROCESSED_PER_THREAD_X is presumably 4 - confirm with the host code.
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address (byte offsets) for matrix A (.s0) and matrix B (.s1)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for matrix A: move to the first row handled by this work-item
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for matrix B: move to the first column handled by this work-item
    src_addr.s1 += idx * sizeof(float);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

    // Clamp to the last plane. The vector operand goes first and the bound is an explicit uint4 splat.
    zin = min(zin, (uint4)(DEPTH_GEMM3D - 1));

    // Add offset due to the cross plane paddings (one byte offset per row of the tile)
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize accumulators
    float acc00 = 0.0f;
    float acc01 = 0.0f;
    float acc02 = 0.0f;
    float acc03 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    float acc10 = 0.0f;
    float acc11 = 0.0f;
    float acc12 = 0.0f;
    float acc13 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    float acc20 = 0.0f;
    float acc21 = 0.0f;
    float acc22 = 0.0f;
    float acc23 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    float acc30 = 0.0f;
    float acc31 = 0.0f;
    float acc32 = 0.0f;
    float acc33 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // A and B src indices get incremented at the same time.
    // Main loop: consume 4 columns of A (and therefore 4 rows of B) per iteration.
    int i = 0;
    for(; i <= ((int)COLS_A - 4); i += 4)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load 4 consecutive values from each row of matrix A (with the cross plane padding offset)
        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load 4 consecutive values from each row of matrix A
        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load the first of the 4 rows of matrix B and move to the next row
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Multiply and accumulate (component .s0 of every A row)
        acc00 = fma(a0.s0, b0.s0, acc00);
        acc01 = fma(a0.s0, b0.s1, acc01);
        acc02 = fma(a0.s0, b0.s2, acc02);
        acc03 = fma(a0.s0, b0.s3, acc03);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

        acc10 = fma(a1.s0, b0.s0, acc10);
        acc11 = fma(a1.s0, b0.s1, acc11);
        acc12 = fma(a1.s0, b0.s2, acc12);
        acc13 = fma(a1.s0, b0.s3, acc13);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

        acc20 = fma(a2.s0, b0.s0, acc20);
        acc21 = fma(a2.s0, b0.s1, acc21);
        acc22 = fma(a2.s0, b0.s2, acc22);
        acc23 = fma(a2.s0, b0.s3, acc23);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        acc30 = fma(a3.s0, b0.s0, acc30);
        acc31 = fma(a3.s0, b0.s1, acc31);
        acc32 = fma(a3.s0, b0.s2, acc32);
        acc33 = fma(a3.s0, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load the next row of matrix B (the A values loaded above are reused)
        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Multiply and accumulate (component .s1 of every A row)
        acc00 = fma(a0.s1, b0.s0, acc00);
        acc01 = fma(a0.s1, b0.s1, acc01);
        acc02 = fma(a0.s1, b0.s2, acc02);
        acc03 = fma(a0.s1, b0.s3, acc03);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

        acc10 = fma(a1.s1, b0.s0, acc10);
        acc11 = fma(a1.s1, b0.s1, acc11);
        acc12 = fma(a1.s1, b0.s2, acc12);
        acc13 = fma(a1.s1, b0.s3, acc13);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

        acc20 = fma(a2.s1, b0.s0, acc20);
        acc21 = fma(a2.s1, b0.s1, acc21);
        acc22 = fma(a2.s1, b0.s2, acc22);
        acc23 = fma(a2.s1, b0.s3, acc23);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        acc30 = fma(a3.s1, b0.s0, acc30);
        acc31 = fma(a3.s1, b0.s1, acc31);
        acc32 = fma(a3.s1, b0.s2, acc32);
        acc33 = fma(a3.s1, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load the next row of matrix B
        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Multiply and accumulate (component .s2 of every A row)
        acc00 = fma(a0.s2, b0.s0, acc00);
        acc01 = fma(a0.s2, b0.s1, acc01);
        acc02 = fma(a0.s2, b0.s2, acc02);
        acc03 = fma(a0.s2, b0.s3, acc03);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

        acc10 = fma(a1.s2, b0.s0, acc10);
        acc11 = fma(a1.s2, b0.s1, acc11);
        acc12 = fma(a1.s2, b0.s2, acc12);
        acc13 = fma(a1.s2, b0.s3, acc13);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

        acc20 = fma(a2.s2, b0.s0, acc20);
        acc21 = fma(a2.s2, b0.s1, acc21);
        acc22 = fma(a2.s2, b0.s2, acc22);
        acc23 = fma(a2.s2, b0.s3, acc23);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        acc30 = fma(a3.s2, b0.s0, acc30);
        acc31 = fma(a3.s2, b0.s1, acc31);
        acc32 = fma(a3.s2, b0.s2, acc32);
        acc33 = fma(a3.s2, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load the last of the 4 rows of matrix B
        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Multiply and accumulate (component .s3 of every A row)
        acc00 = fma(a0.s3, b0.s0, acc00);
        acc01 = fma(a0.s3, b0.s1, acc01);
        acc02 = fma(a0.s3, b0.s2, acc02);
        acc03 = fma(a0.s3, b0.s3, acc03);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

        acc10 = fma(a1.s3, b0.s0, acc10);
        acc11 = fma(a1.s3, b0.s1, acc11);
        acc12 = fma(a1.s3, b0.s2, acc12);
        acc13 = fma(a1.s3, b0.s3, acc13);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

        acc20 = fma(a2.s3, b0.s0, acc20);
        acc21 = fma(a2.s3, b0.s1, acc21);
        acc22 = fma(a2.s3, b0.s2, acc22);
        acc23 = fma(a2.s3, b0.s3, acc23);

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        acc30 = fma(a3.s3, b0.s0, acc30);
        acc31 = fma(a3.s3, b0.s1, acc31);
        acc32 = fma(a3.s3, b0.s2, acc32);
        acc33 = fma(a3.s3, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance matrix A by the 4 columns just consumed
        src_addr.s0 += 4 * sizeof(float);
    }

    // Left-over loop: consume the remaining columns of A (COLS_A not a multiple of 4) one at a time
    for(; i < (int)COLS_A; ++i)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A (with the cross plane padding offset)
        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Multiply and accumulate
        acc00 = fma(a0, b0.s0, acc00);
        acc01 = fma(a0, b0.s1, acc01);
        acc02 = fma(a0, b0.s2, acc02);
        acc03 = fma(a0, b0.s3, acc03);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc10 = fma(a1, b0.s0, acc10);
        acc11 = fma(a1, b0.s1, acc11);
        acc12 = fma(a1, b0.s2, acc12);
        acc13 = fma(a1, b0.s3, acc13);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc20 = fma(a2, b0.s0, acc20);
        acc21 = fma(a2, b0.s1, acc21);
        acc22 = fma(a2, b0.s2, acc22);
        acc23 = fma(a2, b0.s3, acc23);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc30 = fma(a3, b0.s0, acc30);
        acc31 = fma(a3, b0.s1, acc31);
        acc32 = fma(a3, b0.s2, acc32);
        acc33 = fma(a3, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance matrix A by one column
        src_addr.s0 += sizeof(float);
    }

    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    acc00 = acc00 * ALPHA;
    acc01 = acc01 * ALPHA;
    acc02 = acc02 * ALPHA;
    acc03 = acc03 * ALPHA;
#endif // defined(ALPHA)
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)
    acc10 = acc10 * ALPHA;
    acc11 = acc11 * ALPHA;
    acc12 = acc12 * ALPHA;
    acc13 = acc13 * ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)
    acc20 = acc20 * ALPHA;
    acc21 = acc21 * ALPHA;
    acc22 = acc22 * ALPHA;
    acc23 = acc23 * ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
    acc30 = acc30 * ALPHA;
    acc31 = acc31 * ALPHA;
    acc32 = acc32 * ALPHA;
    acc33 = acc33 * ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

    // Clamp to the last plane. The vector operand goes first and the bound is an explicit uint4 splat.
    zout = min(zout, (uint4)(DEPTH_GEMM3D - 1));

    // Add offset due to the cross plane paddings (one byte offset per row of the tile)
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

    // Store the output block
    vstore4((float4)(acc00, acc01, acc02, acc03), 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((float4)(acc10, acc11, acc12, acc13), 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((float4)(acc20, acc21, acc22, acc23), 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((float4)(acc30, acc31, acc32, acc33), 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

#else // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

    // Store the output block
    vstore4((float4)(acc00, acc01, acc02, acc03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((float4)(acc10, acc11, acc12, acc13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((float4)(acc20, acc21, acc22, acc23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((float4)(acc30, acc31, acc32, acc33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}

1903

1904

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1905

*

1906

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1907

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

1908

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1909

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

1910

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1911

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1912

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

1913

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1914

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1915

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1916

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1917

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1918

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1919

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1920

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows (M) of matrix A NOT reshaped

1921

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1922

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1923

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1924

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

1925

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1926

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

1927

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1928

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1929

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1930

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

1931

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1932

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

1933

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1934

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1935

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1936

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

1937

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1938

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

1939

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1940

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1941

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1942

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1943

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1944

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1945

*/

1946

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

1947

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1948

IMAGE_DECLARATION(dst),

1949

uint src0_stride_z,

1950

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1951

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1952

#if defined(REINTERPRET_INPUT_AS_3D)

1953

,

1954

uint src_cross_plane_pad

1955

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1956

#if defined(REINTERPRET_OUTPUT_AS_3D)

1957

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1958

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

1959

#endif // REINTERPRET_OUTPUT_AS_3D

1960

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1961

{

1962

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1963

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1964

1965

// Compute starting address for matrix A and Matrix B

1966

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1967

1968

// Update address for the matrix A

1969

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1970

1971

// Update address for the matrix B

1972

src_addr.s1 += idx * sizeof(float);

1973

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

1974

#if defined(REINTERPRET_INPUT_AS_3D)

1975

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1976

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1982

// |******************|

1983

// | cross_plane_pad |

1984

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1989

1990

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1991

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1992

zin = min(DEPTH_GEMM3D - 1, zin);

1993

1994

// Add offset due to the cross plane paddings

1995

zin *= (src_cross_plane_pad * src0_stride_y);

1996

1997

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1998

// multiply src0_stride_z by DEPTH_GEMM3D

1999

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

2000

2001

#else // defined(REINTERPRET_INPUT_AS_3D)

2002

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2003

// Add offset for batched GEMM

2004

src_addr.s0 += get_global_id(2) * src0_stride_z;

2005

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2006

#endif // defined(REINTERPRET_INPUT_AS_3D)

2007

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2008

#if defined(MATRIX_B_DEPTH)

2009

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2010

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

2011

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2012

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2013

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2014

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2015

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2020

float acc10 = 0.0f;

2021

float acc11 = 0.0f;

2022

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2023

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2024

float acc20 = 0.0f;

2025

float acc21 = 0.0f;

2026

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2027

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2028

float acc30 = 0.0f;

2029

float acc31 = 0.0f;

2030

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2031

2032

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2033

int i = 0;

2034

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2035

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2036

#if defined(REINTERPRET_INPUT_AS_3D)

2037

// Load values from matrix A

2038

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

2039

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2040

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2041

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2042

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2043

2044

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2045

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2046

src_addr.s1 += src1_stride_y;

2047

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2048

src_addr.s1 += src1_stride_y;

2049

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2050

src_addr.s1 += src1_stride_y;

2051

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2052

src_addr.s1 += src1_stride_y;

2053

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2054

src_addr.s1 += src1_stride_y;

2055

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2056

src_addr.s1 += src1_stride_y;

2057

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2058

src_addr.s1 += src1_stride_y;

2059

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

2060

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2061

2062

// Multiply and accumulate

2063

acc00 = fma(a0.s0, b0.s0, acc00);

2064

acc00 = fma(a0.s1, b1.s0, acc00);

2065

acc00 = fma(a0.s2, b2.s0, acc00);

2066

acc00 = fma(a0.s3, b3.s0, acc00);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2067

acc00 = fma(a0.s4, b4.s0, acc00);

2068

acc00 = fma(a0.s5, b5.s0, acc00);

2069

acc00 = fma(a0.s6, b6.s0, acc00);

2070

acc00 = fma(a0.s7, b7.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2071

2072

acc01 = fma(a0.s0, b0.s1, acc01);

2073

acc01 = fma(a0.s1, b1.s1, acc01);

2074

acc01 = fma(a0.s2, b2.s1, acc01);

2075

acc01 = fma(a0.s3, b3.s1, acc01);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2076

acc01 = fma(a0.s4, b4.s1, acc01);

2077

acc01 = fma(a0.s5, b5.s1, acc01);

2078

acc01 = fma(a0.s6, b6.s1, acc01);

2079

acc01 = fma(a0.s7, b7.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2080

2081

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2082

#if defined(REINTERPRET_INPUT_AS_3D)

2083

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2084

#else // defined(REINTERPRET_INPUT_AS_3D)

2085

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2086

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2087

acc10 = fma(a0.s0, b0.s0, acc10);

2088

acc10 = fma(a0.s1, b1.s0, acc10);

2089

acc10 = fma(a0.s2, b2.s0, acc10);

2090

acc10 = fma(a0.s3, b3.s0, acc10);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2091

acc10 = fma(a0.s4, b4.s0, acc10);

2092

acc10 = fma(a0.s5, b5.s0, acc10);

2093

acc10 = fma(a0.s6, b6.s0, acc10);

2094

acc10 = fma(a0.s7, b7.s0, acc10);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2095

2096

acc11 = fma(a0.s0, b0.s1, acc11);

2097

acc11 = fma(a0.s1, b1.s1, acc11);

2098

acc11 = fma(a0.s2, b2.s1, acc11);

2099

acc11 = fma(a0.s3, b3.s1, acc11);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2100

acc11 = fma(a0.s4, b4.s1, acc11);

2101

acc11 = fma(a0.s5, b5.s1, acc11);

2102

acc11 = fma(a0.s6, b6.s1, acc11);

2103

acc11 = fma(a0.s7, b7.s1, acc11);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2104

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2105

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2106

#if defined(REINTERPRET_INPUT_AS_3D)

2107

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2108

#else // defined(REINTERPRET_INPUT_AS_3D)

2109

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2110

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2111

acc20 = fma(a0.s0, b0.s0, acc20);

2112

acc20 = fma(a0.s1, b1.s0, acc20);

2113

acc20 = fma(a0.s2, b2.s0, acc20);

2114

acc20 = fma(a0.s3, b3.s0, acc20);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2115

acc20 = fma(a0.s4, b4.s0, acc20);

2116

acc20 = fma(a0.s5, b5.s0, acc20);

2117

acc20 = fma(a0.s6, b6.s0, acc20);

2118

acc20 = fma(a0.s7, b7.s0, acc20);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2119

2120

acc21 = fma(a0.s0, b0.s1, acc21);

2121

acc21 = fma(a0.s1, b1.s1, acc21);

2122

acc21 = fma(a0.s2, b2.s1, acc21);

2123

acc21 = fma(a0.s3, b3.s1, acc21);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2124

acc21 = fma(a0.s4, b4.s1, acc21);

2125

acc21 = fma(a0.s5, b5.s1, acc21);

2126

acc21 = fma(a0.s6, b6.s1, acc21);

2127

acc21 = fma(a0.s7, b7.s1, acc21);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2128

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2129

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2130

#if defined(REINTERPRET_INPUT_AS_3D)

2131

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2132

#else // defined(REINTERPRET_INPUT_AS_3D)

2133

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2134

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2135

acc30 = fma(a0.s0, b0.s0, acc30);

2136

acc30 = fma(a0.s1, b1.s0, acc30);

2137

acc30 = fma(a0.s2, b2.s0, acc30);

2138

acc30 = fma(a0.s3, b3.s0, acc30);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2139

acc30 = fma(a0.s4, b4.s0, acc30);

2140

acc30 = fma(a0.s5, b5.s0, acc30);

2141

acc30 = fma(a0.s6, b6.s0, acc30);

2142

acc30 = fma(a0.s7, b7.s0, acc30);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2143

2144

acc31 = fma(a0.s0, b0.s1, acc31);

2145

acc31 = fma(a0.s1, b1.s1, acc31);

2146

acc31 = fma(a0.s2, b2.s1, acc31);

2147

acc31 = fma(a0.s3, b3.s1, acc31);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2148

acc31 = fma(a0.s4, b4.s1, acc31);

2149

acc31 = fma(a0.s5, b5.s1, acc31);

2150

acc31 = fma(a0.s6, b6.s1, acc31);

2151

acc31 = fma(a0.s7, b7.s1, acc31);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2152

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2153

2154

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2155

}

2156

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2157

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2158

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2159

#if defined(REINTERPRET_INPUT_AS_3D)

2160

// Load values from matrix A

2161

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

2162

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2163

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2164

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2165

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2166

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2167

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2168

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2169

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2170

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2171

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2172

// Load values from matrix A

2173

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2174

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2175

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2176

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2177

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2178

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2179

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2180

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2181

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2182

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2183

#endif // defined(REINTERPRET_INPUT_AS_3D)

2184

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2185

// Load values from matrix B

2186

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2187

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2188

2189

// Multiply and accumulate

2190

acc00 = fma(a0, b0.s0, acc00);

2191

acc01 = fma(a0, b0.s1, acc01);

2192

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2193

acc10 = fma(a1, b0.s0, acc10);

2194

acc11 = fma(a1, b0.s1, acc11);

2195

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2196

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2197

acc20 = fma(a2, b0.s0, acc20);

2198

acc21 = fma(a2, b0.s1, acc21);

2199

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2200

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2201

acc30 = fma(a3, b0.s0, acc30);

2202

acc31 = fma(a3, b0.s1, acc31);

2203

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2204

2205

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2206

}

2207

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2208

// Multiply by the weight of matrix-matrix product and store the result

2209

#if defined(ALPHA)

2210

acc00 = acc00 * ALPHA;

2211

acc01 = acc01 * ALPHA;

2212

#endif // defined(ALPHA)

2213

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2214

acc10 = acc10 * ALPHA;

2215

acc11 = acc11 * ALPHA;

2216

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2217

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2218

acc20 = acc20 * ALPHA;

2219

acc21 = acc21 * ALPHA;

2220

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2221

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2222

acc30 = acc30 * ALPHA;

2223

acc31 = acc31 * ALPHA;

2224

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2225

2226

int z = get_global_id(2);

2227

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2228

// Compute destination address

2229

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2230

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2231

// Compute dst address

2232

__global uchar *dst_addr = offset(&dst, 0, 0);

2233

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2234

#if defined(REINTERPRET_OUTPUT_AS_3D)

2235

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2236

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2237

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

2242

// |******************|

2243

// | cross_plane_pad |

2244

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2249

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2250

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

2251

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

2252

zout = min(DEPTH_GEMM3D - 1, zout);

2253

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2254

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2255

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2256

2257

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2258

// multiply dst_stride_z by DEPTH_GEMM3D

2259

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2260

2261

// Store the output block

2262

vstore2((float2)(acc00, acc01), 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2263

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2264

vstore2((float2)(acc10, acc11), 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2265

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2266

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2267

vstore2((float2)(acc20, acc21), 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2268

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2269

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2270

vstore2((float2)(acc30, acc31), 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2271

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2272

2273

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2274

// Add offset for batched GEMM

2275

dst_addr += z * dst_stride_z;

2276

2277

// Store the output block

2278

vstore2((float2)(acc00, acc01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

2279

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2280

vstore2((float2)(acc10, acc11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

2281

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2282

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2283

vstore2((float2)(acc20, acc21), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

2284

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2285

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2286

vstore2((float2)(acc30, acc31), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

2287

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2288

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2289

}

2290

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

2291

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2292

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

2293

*

2294

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

2295

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

2296

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8 (each work-item loads and stores 8 half values along X).

2297

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

2298

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

2299

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

2300

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

2301

*

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2302

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2303

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2304

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2305

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2306

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2307

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

2308

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2309

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

2310

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2311

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

2312

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2313

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

2314

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

2315

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

2316

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2317

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

2318

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2319

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

2320

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

2321

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

2322

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2323

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

2324

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2325

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

2326

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2327

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2328

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

2329

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2330

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

2331

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2332

*/

2333

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

2334

IMAGE_DECLARATION(src1),

2335

IMAGE_DECLARATION(dst),

2336

uint src0_stride_z,

2337

uint src1_stride_z,

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2338

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2339

#if defined(REINTERPRET_INPUT_AS_3D)

2340

,

2341

uint src_cross_plane_pad

2342

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2343

#if defined(REINTERPRET_OUTPUT_AS_3D)

2344

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2345

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2346

#endif // REINTERPRET_OUTPUT_AS_3D

2347

)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2348

{

2349

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

2350

2351

// Compute starting address for matrix A and Matrix B

2352

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

2353

2354

// Update address for the matrix A

2355

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

2356

2357

// Update address for the matrix B

2358

src_addr.s1 += idx * sizeof(half);

2359

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2360

#if defined(REINTERPRET_INPUT_AS_3D)

2361

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

2362

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

2368

// |******************|

2369

// | cross_plane_pad |

2370

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

2375

2376

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

2377

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

2378

zin = min(DEPTH_GEMM3D - 1, zin);

2379

2380

// Add offset due to the cross plane paddings

2381

zin *= (src_cross_plane_pad * src0_stride_y);

2382

2383

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2384

// multiply src0_stride_z by DEPTH_GEMM3D

2385

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

2386

2387

#else // defined(REINTERPRET_INPUT_AS_3D)

2388

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2389

// Add offset for batched GEMM

2390

src_addr.s0 += get_global_id(2) * src0_stride_z;

2391

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2392

#endif // defined(REINTERPRET_INPUT_AS_3D)

2393

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2394

#if defined(MATRIX_B_DEPTH)

2395

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2396

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

2397

#else // defined(MATRIX_B_DEPTH)

2398

src_addr.s1 += get_global_id(2) * src1_stride_z;

2399

#endif // defined(MATRIX_B_DEPTH)

2400

2401

half8 acc0 = 0.0h;

2402

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2403

half8 acc1 = 0.0h;

2404

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2405

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2406

half8 acc2 = 0.0h;

2407

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2408

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2409

half8 acc3 = 0.0h;

2410

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2411

2412

int i = 0;

2413

for(; i <= ((int)COLS_A - 4); i += 4)

2414

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2415

#if defined(REINTERPRET_INPUT_AS_3D)

2416

// Load values from matrix A

2417

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

2418

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2419

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2420

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2421

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2422

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2423

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2424

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2425

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2426

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2427

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2428

// Load values from matrix A

2429

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2430

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2431

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2432

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2433

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2434

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2435

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2436

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2437

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2438

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2439

#endif // defined(REINTERPRET_INPUT_AS_3D)

2440

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2441

// Load values from matrix B

2442

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2443

src_addr.s1 += src1_stride_y;

2444

2445

// Accumulate

2446

acc0 = fma(b0, (half8)a0.s0, acc0);

2447

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2448

acc1 = fma(b0, (half8)a1.s0, acc1);

2449

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2450

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2451

acc2 = fma(b0, (half8)a2.s0, acc2);

2452

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2453

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2454

acc3 = fma(b0, (half8)a3.s0, acc3);

2455

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2456

2457

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2458

src_addr.s1 += src1_stride_y;

2459

acc0 = fma(b0, (half8)a0.s1, acc0);

2460

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2461

acc1 = fma(b0, (half8)a1.s1, acc1);

2462

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2463

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2464

acc2 = fma(b0, (half8)a2.s1, acc2);

2465

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2466

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2467

acc3 = fma(b0, (half8)a3.s1, acc3);

2468

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2469

2470

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2471

src_addr.s1 += src1_stride_y;

2472

acc0 = fma(b0, (half8)a0.s2, acc0);

2473

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2474

acc1 = fma(b0, (half8)a1.s2, acc1);

2475

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2476

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2477

acc2 = fma(b0, (half8)a2.s2, acc2);

2478

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2479

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2480

acc3 = fma(b0, (half8)a3.s2, acc3);

2481

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2482

2483

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2484

src_addr.s1 += src1_stride_y;

2485

acc0 = fma(b0, (half8)a0.s3, acc0);

2486

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2487

acc1 = fma(b0, (half8)a1.s3, acc1);

2488

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2489

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2490

acc2 = fma(b0, (half8)a2.s3, acc2);

2491

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2492

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2493

acc3 = fma(b0, (half8)a3.s3, acc3);

2494

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2495

2496

src_addr.s0 += 4 * sizeof(half);

2497

}

2498

2499

for(; i < (int)COLS_A; ++i)

2500

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2501

#if defined(REINTERPRET_INPUT_AS_3D)

2502

// Load values from matrix A

2503

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

2504

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2505

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

2506

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2507

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2508

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

2509

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2510

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2511

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

2512

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2513

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2514

// Load values from matrix A

2515

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2516

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2517

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2518

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2519

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2520

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2521

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2522

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2523

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2524

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2525

#endif // defined(REINTERPRET_INPUT_AS_3D)

2526

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2527

// Load values from matrix B

2528

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2529

2530

src_addr += (int2)(sizeof(half), src1_stride_y);

2531

2532

// Accumulate

2533

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

2534

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2535

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

2536

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2537

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2538

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

2539

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2540

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2541

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

2542

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2543

}

2544

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2545

// Multiply by the weight of matrix-matrix product and store the result

2546

#if defined(ALPHA)

2547

acc0 = acc0 * (half8)ALPHA;

2548

#endif // defined(ALPHA)

2549

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2550

acc1 = acc1 * (half8)ALPHA;

2551

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 && defined(ALPHA)

2552

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2553

acc2 = acc2 * (half8)ALPHA;

2554

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 && defined(ALPHA)

2555

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2556

acc3 = acc3 * (half8)ALPHA;

2557

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)

2558

2559

int z = get_global_id(2);

2560

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2561

// Compute destination address

2562

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2563

2564

// Compute dst address

2565

__global uchar *dst_addr = offset(&dst, 0, 0);

2566

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2567

#if defined(REINTERPRET_OUTPUT_AS_3D)

2568

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2569

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2570

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

2575

// |******************|

2576

// | cross_plane_pad |

2577

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2582

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2583

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

2584

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

2585

zout = min(DEPTH_GEMM3D - 1, zout);

2586

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2587

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2588

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2589

2590

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2591

// multiply dst_stride_z by DEPTH_GEMM3D

2592

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2593

2594

// Store the output block

2595

vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

2596

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2597

vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

2598

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2599

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2600

vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

2601

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2602

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2603

vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

2604

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2605

2606

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2607

// Add offset for batched GEMM

2608

dst_addr += z * dst_stride_z;

2609

2610

// Store the output block

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2611

vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

2612

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2613

vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

2614

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2615

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2616

vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

2617

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2618

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2619

vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

2620

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2621

#endif // REINTERPRET_OUTPUT_AS_3D

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2622

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

2623

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

2624

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2625

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2626

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2627

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2628

/** In-place weighted matrix addition (F32): dst = dst + BETA * src.
 *
 * Each work-item reads four consecutive floats from the destination and four from the source,
 * scales the source values by BETA, adds them to the destination values and writes the four
 * results back to the same destination location.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 * @note Per the naming used in the kernel body, dst is expected to already hold the (alpha-scaled) A x B product and src the matrix C.
 *
 * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]      src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in, out] dst_ptr                           Pointer to the destination matrix (read and then overwritten). Supported data types: same as @p src_ptr
 * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]      dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of both tensors
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views on the current element of each tensor
    __global float *ab_values = (__global float *)dst.ptr;
    __global float *c_values  = (__global float *)src.ptr;

    // Current content of the destination (A x B result)
    const float4 ab_block = vload4(0, ab_values);

    // Matching values of matrix C
    const float4 c_block = vload4(0, c_values);

    // ab + beta * c
    const float4 result = ab_block + (float4)BETA * c_block;

    // Overwrite the destination with the accumulated block
    vstore4(result, 0, ab_values);
}

2668

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

2669

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2670

/** In-place weighted matrix addition (F16): dst = dst + BETA * src.
 *
 * Each work-item reads eight consecutive halfs from the destination and eight from the source,
 * scales the source values by BETA, adds them to the destination values and writes the eight
 * results back to the same destination location.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 * @note Per the naming used in the kernel body, dst is expected to already hold the (alpha-scaled) A x B product and src the matrix C.
 *
 * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]      src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in, out] dst_ptr                           Pointer to the destination matrix (read and then overwritten). Supported data types: same as @p src_ptr
 * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]      dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of both tensors
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views on the current element of each tensor
    __global half *ab_values = (__global half *)dst.ptr;
    __global half *c_values  = (__global half *)src.ptr;

    // Current content of the destination (A x B result)
    const half8 ab_block = vload8(0, ab_values);

    // Matching values of matrix C
    const half8 c_block = vload8(0, c_values);

    // ab + beta * c
    const half8 result = ab_block + (half8)BETA * c_block;

    // Overwrite the destination with the accumulated block
    vstore8(result, 0, ab_values);
}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

2710

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2711

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2712

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2713

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2714

/** Vector-by-matrix multiplication between each row of A (src0) and matrix B (src1), used for the locally connected layer.
 *
 * The work-item with global id (x, y) takes row y of src0 as the vector, plane y of src1 as the
 * matrix, and produces four consecutive output values starting at column 4 * x.
 * The WIDTH_VECTOR_A elements of the vector are consumed two at a time; a second loop handles a
 * possible left-over element.
 *
 * @note The width of A must be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @note The input A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // First output column handled by this work-item (4 values per work-item) and the row index
    int col = get_global_id(0) * 4;
    int row = get_global_id(1);

    // Byte offsets: .s0 walks along the row of vector A, .s1 walks down the rows of matrix B
    int2 byte_off = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * row, src1_offset_first_element_in_bytes + src1_stride_z * row));
    byte_off.s1 += col * sizeof(float);

    // Byte offset one past the last element of the current row of A
    int end_row_vec_a = byte_off.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 sum = (float4)0.0f;

    // Main loop: two elements of A (and two rows of B) per iteration
    const int  last_pair_off = end_row_vec_a - 2 * (int)sizeof(float);
    const int2 pair_step     = (int2)(2 * sizeof(float), 2 * src1_stride_y);
    while(byte_off.s0 <= last_pair_off)
    {
        float2 a_pair = vload2(0, (__global float *)(src0_ptr + byte_off.s0));
        float4 b_row0 = vload4(0, (__global float *)(src1_ptr + byte_off.s1));
        float4 b_row1 = vload4(0, (__global float *)(src1_ptr + byte_off.s1 + src1_stride_y));

        sum += b_row0 * (float4)a_pair.s0;
        sum += b_row1 * (float4)a_pair.s1;

        byte_off += pair_step;
    }

    // Left-over loop: one element of A (and one row of B) per iteration
    const int2 single_step = (int2)(sizeof(float), src1_stride_y);
    while(byte_off.s0 < end_row_vec_a)
    {
        float  a_val = *((__global float *)(src0_ptr + byte_off.s0));
        float4 b_row = vload4(0, (__global float *)(src1_ptr + byte_off.s1));

        sum += b_row * (float4)a_val;

        byte_off += single_step;
    }

    // Resolve the destination address for this work-item and write the four results
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    __global float *out_ptr = (__global float *)(offset(&dst, 0, 0));
    vstore4(sum, 0, out_ptr);
}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2779

#endif // defined(WIDTH_VECTOR_A)

2780

2781

/** This kernel accumulates each row with the biases vector.

2782

*

2783

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.

2784

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.

2785

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

2786

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2787

* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)

2788

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

2789

* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)

2790

* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)

2791

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

2792

* @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr

2793

* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)

2794

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

2795

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector

2796

*/

2797

#if defined(DATA_TYPE) && defined(VECTOR_SIZE)

2798

__kernel void gemm_accumulate_biases(

2799

IMAGE_DECLARATION(accum),

2800

VECTOR_DECLARATION(biases))

2801

{

2802

Image accum = CONVERT_TO_IMAGE_STRUCT(accum);

2803

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

2804

2805

// Vector size, i.e. number of vector elements.

2806

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

2807

accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);

2808

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

2809

biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

2810

accum_value = biases_value + accum_value;

Anton Lokhmotov