Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-01-12 10:21:40 +0000

[diff] [blame]

46

*

47

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

48

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

49

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

50

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

51

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

52

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

53

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

54

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

55

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

56

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

57

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

58

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

59

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

60

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

61

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

62

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

63

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

64

__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
                                TENSOR3D_DECLARATION(dst))
{
    // Work-item coordinates: X/Y pick the source vector, Z picks the batch slice
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source tensor (matrix B) descriptor; its ptr field is the load address used below
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Destination (transposed matrix B): X and Y are swapped, so the work-item Y index advances
    // inside a destination row while the work-item X index selects the destination row
    // (gid_x / MULT_TRANSPOSE1XW_WIDTH) and the 1xW slot within it (gid_x % MULT_TRANSPOSE1XW_WIDTH)
    const uint dst_row  = gid_x / MULT_TRANSPOSE1XW_WIDTH;
    const uint dst_slot = gid_x % MULT_TRANSPOSE1XW_WIDTH;

    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH;
    dst_addr_in_bytes += dst_row * dst_stride_y;
    dst_addr_in_bytes += dst_slot * TRANSPOSE_W * sizeof(DATA_TYPE);

    // Add offset for batched GEMM
    dst_addr_in_bytes += gid_z * dst_stride_z;

    // Read TRANSPOSE_W consecutive elements from the source...
    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
    row_vec = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src_tensor.ptr);

    // ...and write them unchanged at the computed destination address
    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes);

    VSTORE(TRANSPOSE_W)
    (row_vec, 0, dst_addr);
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

87

#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

88

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

89

#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

90

91

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

92

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

93

* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)

94

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

95

*

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

96

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

97

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

98

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

99

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

100

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

101

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

102

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

103

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

104

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

105

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

106

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

107

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

108

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

109

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

110

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

111

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

112

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

113

__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
                                 TENSOR3D_DECLARATION(dst))
{
    // Work-item coordinates: X/Y pick the 4x4 source block, Z picks the batch slice
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source tensor descriptor; its ptr field is the address of the first of the four rows loaded below
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Destination of the interleaved block: the work-item Y index selects the destination row
    // (gid_y / MULT_INTERLEAVE4X4_HEIGHT) and the 4-element slot within it (gid_y % MULT_INTERLEAVE4X4_HEIGHT),
    // while the work-item X index advances by 16 * MULT_INTERLEAVE4X4_HEIGHT elements inside the row
    const uint dst_row  = gid_y / MULT_INTERLEAVE4X4_HEIGHT;
    const uint dst_slot = gid_y % MULT_INTERLEAVE4X4_HEIGHT;

    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT;
    dst_addr_in_bytes += dst_row * dst_stride_y;
    dst_addr_in_bytes += dst_slot * 4 * sizeof(DATA_TYPE);

    // Add offset for batched GEMM
    dst_addr_in_bytes += gid_z * dst_stride_z;

    // Load four 4-element vectors from four consecutive source rows (src_stride_y bytes apart)
    __global uchar *row_ptr = src_tensor.ptr;

    VEC_DATA_TYPE(DATA_TYPE, 4)
    row0 = vload4(0, (__global DATA_TYPE *)(row_ptr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    row1 = vload4(0, (__global DATA_TYPE *)(row_ptr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    row2 = vload4(0, (__global DATA_TYPE *)(row_ptr + 2 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    row3 = vload4(0, (__global DATA_TYPE *)(row_ptr + 3 * src_stride_y));

    // Each column of the 4x4 block is written as one vector; consecutive columns are placed
    // 4 * MULT_INTERLEAVE4X4_HEIGHT elements apart in the destination
    __global DATA_TYPE *dst_base = (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes);

    VEC_DATA_TYPE(DATA_TYPE, 4)
    col0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(row0.s0, row1.s0, row2.s0, row3.s0);
    vstore4(col0, 0, dst_base + 0 * MULT_INTERLEAVE4X4_HEIGHT);

    VEC_DATA_TYPE(DATA_TYPE, 4)
    col1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(row0.s1, row1.s1, row2.s1, row3.s1);
    vstore4(col1, 0, dst_base + 4 * MULT_INTERLEAVE4X4_HEIGHT);

    VEC_DATA_TYPE(DATA_TYPE, 4)
    col2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(row0.s2, row1.s2, row2.s2, row3.s2);
    vstore4(col2, 0, dst_base + 8 * MULT_INTERLEAVE4X4_HEIGHT);

    VEC_DATA_TYPE(DATA_TYPE, 4)
    col3 = (VEC_DATA_TYPE(DATA_TYPE, 4))(row0.s3, row1.s3, row2.s3, row3.s3);
    vstore4(col3, 0, dst_base + 12 * MULT_INTERLEAVE4X4_HEIGHT);
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

156

#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

157

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

158

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

159

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

160

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

161

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

162

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

163

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

164

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

165

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

166

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

167

*

168

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

169

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

170

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

171

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

172

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

173

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

174

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

175

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

176

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

177

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

178

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

179

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

180

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

181

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

182

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

183

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

184

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

185

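* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes)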

186

*/

Gian Marco Iodice

bb36a8e

2018-04-19 12:05:08 +0100

[diff] [blame]

187

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
                                                 uint dst_stride_z)
{
    // Row of reshaped matrix B, row of reshaped matrix A and batch index handled by this work-item
    const int row_b = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int row_a = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    const int batch = get_global_id(2);

    // Element offset of this work-item's 4-element slot inside the reshaped rows
    const int slot_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int slot_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // Byte offset of the matrix A row
    int src0_addr_in_bytes = src0_offset_first_element_in_bytes;
    src0_addr_in_bytes += row_a * src0_stride_y;
    src0_addr_in_bytes += batch * src0_stride_z;

    // Byte offset of the matrix B row
    int src1_addr_in_bytes = src1_offset_first_element_in_bytes;
    src1_addr_in_bytes += row_b * src1_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (batch % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += batch * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global float *row_start_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
    __global float *row_start_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

    // End of the matrix B row; measured from the row start, i.e. before the slot offset is applied
    __global float *src_end_addr_b = row_start_b + COLS_B;

    // Current read positions inside the two rows
    __global float *src_addr_a = row_start_a + slot_a;
    __global float *src_addr_b = row_start_b + slot_b;

    // Accumulators: one float4 per output row of the 4x4 block
    float4 acc0 = (float4)(0.0f);
    float4 acc1 = (float4)(0.0f);
    float4 acc2 = (float4)(0.0f);
    float4 acc3 = (float4)(0.0f);

    // Main loop, unrolled by two: consumes two float4 of A and two float4 of B per iteration
    while(src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)))
    {
        // First pair: values from matrix A (interleaved) and matrix B (transposed)
        float4 a_first = vload4(0, src_addr_a);
        float4 b_first = vload4(0, src_addr_b);

        acc0 += (float4)a_first.s0 * b_first;
        acc1 += (float4)a_first.s1 * b_first;
        acc2 += (float4)a_first.s2 * b_first;
        acc3 += (float4)a_first.s3 * b_first;

        // Second pair: one slot stride further in both rows
        float4 a_second = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        float4 b_second = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

        acc0 += (float4)a_second.s0 * b_second;
        acc1 += (float4)a_second.s1 * b_second;
        acc2 += (float4)a_second.s2 * b_second;
        acc3 += (float4)a_second.s3 * b_second;

        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Left-over loop: one float4 of A and one float4 of B per iteration
    while(src_addr_b < src_end_addr_b)
    {
        float4 a_vec = vload4(0, src_addr_a);
        float4 b_vec = vload4(0, src_addr_b);

        acc0 += (float4)a_vec.s0 * b_vec;
        acc1 += (float4)a_vec.s1 * b_vec;
        acc2 += (float4)a_vec.s2 * b_vec;
        acc3 += (float4)a_vec.s3 * b_vec;

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
    }

#if defined(ALPHA)
    // Multiply by the weight of matrix product
    acc0 = acc0 * (float4)ALPHA;
    acc1 = acc1 * (float4)ALPHA;
    acc2 = acc2 * (float4)ALPHA;
    acc3 = acc3 * (float4)ALPHA;
#endif // defined(ALPHA)

    // Compute destination address
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst_img, 0, 0);

    // Add offset for batched GEMM
    dst_addr += batch * dst_stride_z;

    // Store 4x4 block: one float4 per destination row, rows dst_stride_y bytes apart
    vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
    vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
    vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
    vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
}

285

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

286

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

287

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

288

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

289

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

290

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

291

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

292

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

293

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

294

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

295

*

296

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

297

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

298

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

299

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

300

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

301

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

302

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

303

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

304

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

305

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

306

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

307

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

308

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

309

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

310

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

311

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

312

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

313

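* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes)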

314

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

315

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

316

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

317

IMAGE_DECLARATION(dst),

318

uint src0_stride_z,

319

uint src1_stride_z,

320

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

321

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

322

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

323

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

324

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

325

326

// Offset

327

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

328

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

329

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

330

// src_addr_a = address of matrix A

331

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

332

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

333

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

334

335

#if defined(MATRIX_B_DEPTH)

336

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

337

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

338

#else // defined(MATRIX_B_DEPTH)

339

src1_addr_in_bytes += z * src1_stride_z;

340

#endif // defined(MATRIX_B_DEPTH)

341

342

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

343

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

344

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

345

src_addr_a += offset_row_a;

346

src_addr_b += offset_row_b;

347

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

348

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

366

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

367

368

int i = 0;

369

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

370

{

371

// Load values from matrix A (interleaved) and matrix B (transposed)

372

float4 a0 = vload4(0, src_addr_a);

373

float4 b0 = vload4(0, src_addr_b);

374

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

375

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

376

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

377

378

c00 = fma(a0.s0, b0.s0, c00);

379

c01 = fma(a0.s0, b0.s1, c01);

380

c02 = fma(a0.s0, b0.s2, c02);

381

c03 = fma(a0.s0, b0.s3, c03);

382

383

c10 = fma(a0.s1, b0.s0, c10);

384

c11 = fma(a0.s1, b0.s1, c11);

385

c12 = fma(a0.s1, b0.s2, c12);

386

c13 = fma(a0.s1, b0.s3, c13);

387

388

c20 = fma(a0.s2, b0.s0, c20);

389

c21 = fma(a0.s2, b0.s1, c21);

390

c22 = fma(a0.s2, b0.s2, c22);

391

c23 = fma(a0.s2, b0.s3, c23);

392

393

c30 = fma(a0.s3, b0.s0, c30);

394

c31 = fma(a0.s3, b0.s1, c31);

395

c32 = fma(a0.s3, b0.s2, c32);

396

c33 = fma(a0.s3, b0.s3, c33);

397

398

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

399

a0 = vload4(0, src_addr_a);

400

b0 = vload4(0, src_addr_b);

401

402

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

403

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

404

405

c00 = fma(a0.s0, b0.s0, c00);

406

c01 = fma(a0.s0, b0.s1, c01);

407

c02 = fma(a0.s0, b0.s2, c02);

408

c03 = fma(a0.s0, b0.s3, c03);

409

410

c10 = fma(a0.s1, b0.s0, c10);

411

c11 = fma(a0.s1, b0.s1, c11);

412

c12 = fma(a0.s1, b0.s2, c12);

413

c13 = fma(a0.s1, b0.s3, c13);

414

415

c20 = fma(a0.s2, b0.s0, c20);

416

c21 = fma(a0.s2, b0.s1, c21);

417

c22 = fma(a0.s2, b0.s2, c22);

418

c23 = fma(a0.s2, b0.s3, c23);

419

420

c30 = fma(a0.s3, b0.s0, c30);

421

c31 = fma(a0.s3, b0.s1, c31);

422

c32 = fma(a0.s3, b0.s2, c32);

423

c33 = fma(a0.s3, b0.s3, c33);

424

425

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

426

a0 = vload4(0, src_addr_a);

427

b0 = vload4(0, src_addr_b);

428

429

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

430

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

431

432

c00 = fma(a0.s0, b0.s0, c00);

433

c01 = fma(a0.s0, b0.s1, c01);

434

c02 = fma(a0.s0, b0.s2, c02);

435

c03 = fma(a0.s0, b0.s3, c03);

436

437

c10 = fma(a0.s1, b0.s0, c10);

438

c11 = fma(a0.s1, b0.s1, c11);

439

c12 = fma(a0.s1, b0.s2, c12);

440

c13 = fma(a0.s1, b0.s3, c13);

441

442

c20 = fma(a0.s2, b0.s0, c20);

443

c21 = fma(a0.s2, b0.s1, c21);

444

c22 = fma(a0.s2, b0.s2, c22);

445

c23 = fma(a0.s2, b0.s3, c23);

446

447

c30 = fma(a0.s3, b0.s0, c30);

448

c31 = fma(a0.s3, b0.s1, c31);

449

c32 = fma(a0.s3, b0.s2, c32);

450

c33 = fma(a0.s3, b0.s3, c33);

451

452

// Load values from matrix A (interleaved) and matrix B (transposed)

453

a0 = vload4(0, src_addr_a);

454

b0 = vload4(0, src_addr_b);

455

456

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

457

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

458

459

c00 = fma(a0.s0, b0.s0, c00);

460

c01 = fma(a0.s0, b0.s1, c01);

461

c02 = fma(a0.s0, b0.s2, c02);

462

c03 = fma(a0.s0, b0.s3, c03);

463

464

c10 = fma(a0.s1, b0.s0, c10);

465

c11 = fma(a0.s1, b0.s1, c11);

466

c12 = fma(a0.s1, b0.s2, c12);

467

c13 = fma(a0.s1, b0.s3, c13);

468

469

c20 = fma(a0.s2, b0.s0, c20);

470

c21 = fma(a0.s2, b0.s1, c21);

471

c22 = fma(a0.s2, b0.s2, c22);

472

c23 = fma(a0.s2, b0.s3, c23);

473

474

c30 = fma(a0.s3, b0.s0, c30);

475

c31 = fma(a0.s3, b0.s1, c31);

476

c32 = fma(a0.s3, b0.s2, c32);

477

c33 = fma(a0.s3, b0.s3, c33);

478

}

479

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

480

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

481

{

482

// Load values from matrix A (interleaved) and matrix B (transposed)

483

float4 a0 = vload4(0, src_addr_a);

484

float4 b0 = vload4(0, src_addr_b);

485

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

486

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

487

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

488

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

489

c00 = fma(a0.s0, b0.s0, c00);

490

c01 = fma(a0.s0, b0.s1, c01);

491

c02 = fma(a0.s0, b0.s2, c02);

492

c03 = fma(a0.s0, b0.s3, c03);

493

494

c10 = fma(a0.s1, b0.s0, c10);

495

c11 = fma(a0.s1, b0.s1, c11);

496

c12 = fma(a0.s1, b0.s2, c12);

497

c13 = fma(a0.s1, b0.s3, c13);

498

499

c20 = fma(a0.s2, b0.s0, c20);

500

c21 = fma(a0.s2, b0.s1, c21);

501

c22 = fma(a0.s2, b0.s2, c22);

502

c23 = fma(a0.s2, b0.s3, c23);

503

504

c30 = fma(a0.s3, b0.s0, c30);

505

c31 = fma(a0.s3, b0.s1, c31);

506

c32 = fma(a0.s3, b0.s2, c32);

507

c33 = fma(a0.s3, b0.s3, c33);

508

}

509

510

// Compute destination address

511

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

512

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

513

#if defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

514

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

531

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

532

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

533

// Compute dst address

534

__global uchar *dst_addr = offset(&dst, 0, 0);

535

536

// Add offset for batched GEMM

537

dst_addr += z * dst_stride_z;

538

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

539

// Store 4x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

540

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

541

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

542

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

543

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

544

}

545

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

546

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

547

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

548

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

549

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

550

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

551

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

552

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

553

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

554

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

555

*

556

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

557

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

558

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

559

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

560

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

561

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

562

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

563

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

564

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

565

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

566

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

567

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

568

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

569

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

570

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

571

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

572

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

573

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

574

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

575

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

576

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

577

IMAGE_DECLARATION(dst),

578

uint src0_stride_z,

579

uint src1_stride_z,

580

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

581

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

582

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

583

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

584

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

585

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

586

// Offset

587

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

588

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

589

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

590

// src_addr_a = address of matrix A

591

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

592

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

593

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

594

595

#if defined(MATRIX_B_DEPTH)

596

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

597

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

598

#else // defined(MATRIX_B_DEPTH)

599

src1_addr_in_bytes += z * src1_stride_z;

600

#endif // defined(MATRIX_B_DEPTH)

601

602

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

603

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

604

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

605

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

606

__global half *src_end_addr_b = src_addr_b + COLS_B;

607

608

src_addr_a += offset_row_a;

609

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

610

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

611

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

617

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

618

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

619

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

620

half4 a0 = vload4(0, src_addr_a);

621

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

622

623

c00 += (half8)a0.s0 * b0;

624

c10 += (half8)a0.s1 * b0;

625

c20 += (half8)a0.s2 * b0;

626

c30 += (half8)a0.s3 * b0;

627

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

628

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

629

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

630

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

631

632

c00 += (half8)a0.s0 * b0;

633

c10 += (half8)a0.s1 * b0;

634

c20 += (half8)a0.s2 * b0;

635

c30 += (half8)a0.s3 * b0;

636

}

637

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

638

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

639

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

640

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

641

half4 a0 = vload4(0, src_addr_a);

642

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

643

644

c00 += (half8)a0.s0 * b0;

645

c10 += (half8)a0.s1 * b0;

646

c20 += (half8)a0.s2 * b0;

647

c30 += (half8)a0.s3 * b0;

648

}

649

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

650

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

651

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

652

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

653

#if defined(ALPHA)

654

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

655

c00 = c00 * (half8)ALPHA;

656

c10 = c10 * (half8)ALPHA;

657

c20 = c20 * (half8)ALPHA;

658

c30 = c30 * (half8)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

659

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

660

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

661

// Compute dst address

662

__global uchar *dst_addr = offset(&dst, 0, 0);

663

664

// Add offset for batched GEMM

665

dst_addr += z * dst_stride_z;

666

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

667

// Store 4x8 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

668

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

669

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

670

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

671

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

672

}

Gian Marco Iodice

bb36a8e

2018-04-19 12:05:08 +0100

[diff] [blame]

673

674

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)

675

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

676

*

677

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

678

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

679

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

680

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

681

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

682

*

683

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

684

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

685

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

686

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

687

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

688

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

689

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

690

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

691

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

692

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

693

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

694

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

695

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

696

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

697

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

698

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

699

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

700

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

701

*/

702

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

703

IMAGE_DECLARATION(src1),

704

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z)

{

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

710

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

711

int z = get_global_id(2);

712

713

// Offset

714

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

715

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

716

717

// src_addr_a = address of matrix A

718

// src_addr_b = address of matrix B

719

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

720

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

721

722

#if defined(MATRIX_B_DEPTH)

723

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

724

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

725

#else // defined(MATRIX_B_DEPTH)

726

src1_addr_in_bytes += z * src1_stride_z;

727

#endif // defined(MATRIX_B_DEPTH)

728

729

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

730

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

731

732

// Compute end row address for matrix B

733

__global half *src_end_addr_b = src_addr_b + COLS_B;

734

735

src_addr_a += offset_row_a;

736

src_addr_b += offset_row_b;

737

738

// Reset accumulators

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

745

746

int i = 0;

747

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

748

{

749

#if MULT_INTERLEAVE4X4_HEIGHT == 1

750

// Load values from matrix A (interleaved) and matrix B (transposed)

751

half8 a0 = vload8(0, src_addr_a);

752

half8 b0 = vload8(0, src_addr_b);

753

754

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

755

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

756

757

c00 = fma((half8)a0.s0, b0, c00);

758

c10 = fma((half8)a0.s1, b0, c10);

759

c20 = fma((half8)a0.s2, b0, c20);

760

c30 = fma((half8)a0.s3, b0, c30);

761

762

// Load values from matrix B (transposed)

763

b0 = vload8(0, src_addr_b);

764

765

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

766

767

c00 = fma((half8)a0.s4, b0, c00);

768

c10 = fma((half8)a0.s5, b0, c10);

769

c20 = fma((half8)a0.s6, b0, c20);

770

c30 = fma((half8)a0.s7, b0, c30);

771

772

// Load values from matrix A (interleaved) and matrix B (transposed)

773

a0 = vload8(0, src_addr_a);

774

b0 = vload8(0, src_addr_b);

775

776

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

777

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

778

779

c00 = fma((half8)a0.s0, b0, c00);

780

c10 = fma((half8)a0.s1, b0, c10);

781

c20 = fma((half8)a0.s2, b0, c20);

782

c30 = fma((half8)a0.s3, b0, c30);

783

784

// Load values from matrix B (transposed)

785

b0 = vload8(0, src_addr_b);

786

787

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

788

789

c00 = fma((half8)a0.s4, b0, c00);

790

c10 = fma((half8)a0.s5, b0, c10);

791

c20 = fma((half8)a0.s6, b0, c20);

792

c30 = fma((half8)a0.s7, b0, c30);

793

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

794

// Load values from matrix A (interleaved) and matrix B (transposed)

795

half4 a0 = vload4(0, src_addr_a);

796

half8 b0 = vload8(0, src_addr_b);

797

798

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

799

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

800

801

c00 = fma((half8)a0.s0, b0, c00);

802

c10 = fma((half8)a0.s1, b0, c10);

803

c20 = fma((half8)a0.s2, b0, c20);

804

c30 = fma((half8)a0.s3, b0, c30);

805

806

// Load values from matrix A (interleaved) and matrix B (transposed)

807

a0 = vload4(0, src_addr_a);

808

b0 = vload8(0, src_addr_b);

809

810

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

811

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

812

813

c00 = fma((half8)a0.s0, b0, c00);

814

c10 = fma((half8)a0.s1, b0, c10);

815

c20 = fma((half8)a0.s2, b0, c20);

816

c30 = fma((half8)a0.s3, b0, c30);

817

818

// Load values from matrix A (interleaved) and matrix B (transposed)

819

a0 = vload4(0, src_addr_a);

820

b0 = vload8(0, src_addr_b);

821

822

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

823

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

824

825

c00 = fma((half8)a0.s0, b0, c00);

826

c10 = fma((half8)a0.s1, b0, c10);

827

c20 = fma((half8)a0.s2, b0, c20);

828

c30 = fma((half8)a0.s3, b0, c30);

829

830

// Load values from matrix A (interleaved) and matrix B (transposed)

831

a0 = vload4(0, src_addr_a);

832

b0 = vload8(0, src_addr_b);

833

834

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

835

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

836

837

c00 = fma((half8)a0.s0, b0, c00);

838

c10 = fma((half8)a0.s1, b0, c10);

839

c20 = fma((half8)a0.s2, b0, c20);

840

c30 = fma((half8)a0.s3, b0, c30);

841

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

842

}

843

844

for(; i < (int)(COLS_MTX_B); ++i)

845

{

846

// Load values from matrix A (interleaved) and matrix B (transposed)

847

half4 a0 = vload4(0, src_addr_a);

848

half8 b0 = vload8(0, src_addr_b);

849

850

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

851

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

852

853

c00 = fma((half8)a0.s0, b0, c00);

854

c10 = fma((half8)a0.s1, b0, c10);

855

c20 = fma((half8)a0.s2, b0, c20);

856

c30 = fma((half8)a0.s3, b0, c30);

857

}

858

859

// Compute destination address

860

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

861

862

#if defined(ALPHA)

863

// Multiply by the weight of matrix product

864

c00 = c00 * (half8)ALPHA;

865

c10 = c10 * (half8)ALPHA;

866

c20 = c20 * (half8)ALPHA;

867

c30 = c30 * (half8)ALPHA;

868

#endif // defined(ALPHA)

869

870

// Compute dst address

871

__global uchar *dst_addr = offset(&dst, 0, 0);

872

873

// Add offset for batched GEMM

874

dst_addr += z * dst_stride_z;

875

876

// Store 4x8 block

877

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

878

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

879

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

880

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

881

}

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

882

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

883

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

884

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

885

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision

886

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication

887

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

888

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

889

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

890

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

891

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

892

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

893

* @note:ALPHA must be passed in 8 bit fixed point format

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

894

*

895

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8

896

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

897

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

898

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

899

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

900

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

901

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

902

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

903

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

904

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

905

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

906

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

907

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

908

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

909

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

910

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

911

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

912

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

913

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

914

__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),

915

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

916

IMAGE_DECLARATION(dst),

917

uint src0_stride_z,

918

uint src1_stride_z,

919

uint dst_stride_z)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

920

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

921

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

922

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

923

int z = get_global_id(2);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

924

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

925

// Offset

926

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

927

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

928

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

929

// src_addr_a = address of matrix A

930

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

931

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

932

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

933

934

#if defined(MATRIX_B_DEPTH)

935

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

936

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

937

#else // defined(MATRIX_B_DEPTH)

938

src1_addr_in_bytes += z * src1_stride_z;

939

#endif // defined(MATRIX_B_DEPTH)

940

941

__global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);

942

__global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

943

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

944

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

945

__global char *src_end_addr_b = src_addr_b + COLS_B;

946

947

src_addr_a += offset_row_a;

948

src_addr_b += offset_row_b;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

949

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

950

// Reset accumulators

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

short8 c00 = 0.0f;

short8 c10 = 0.0f;

short8 c20 = 0.0f;

short8 c30 = 0.0f;

short8 c01 = 0.0f;

short8 c11 = 0.0f;

short8 c21 = 0.0f;

short8 c31 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

960

// This for loop performs 1 accumulation for each iteration

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

961

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

962

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

963

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

964

char4 a0 = vload4(0, src_addr_a);

965

char16 b0 = vload16(0, src_addr_b);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

966

967

c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

968

c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);

969

c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);

970

c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);

971

972

c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

973

c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);

974

c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);

975

c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);

976

}

977

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

978

// Compute destination address

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

979

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

980

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

981

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

982

char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));

983

char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));

984

char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));

985

char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));

986

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

987

#if defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

988

c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

989

c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

990

c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

991

c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

992

#endif // defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

993

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

994

// Compute dst address

995

__global uchar *dst_addr = offset(&dst, 0, 0);

996

997

// Add offset for batched GEMM

998

dst_addr += z * dst_stride_z;

999

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1000

// Store 16x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1001

vstore16(c00_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));

1002

vstore16(c10_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));

1003

vstore16(c20_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));

1004

vstore16(c30_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1005

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1006

1007

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision

1008

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

1009

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

1010

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

1011

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

1012

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1013

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1014

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

1015

* @note:ALPHA must be passed in 16 bit fixed point format

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1016

*

1017

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

1018

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1019

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1020

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1021

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1022

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1023

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1024

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1025

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1026

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1027

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1028

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1029

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1030

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1031

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1032

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1033

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1034

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1035

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1036

__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),

1037

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1038

IMAGE_DECLARATION(dst),

1039

uint src0_stride_z,

1040

uint src1_stride_z,

1041

uint dst_stride_z)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1042

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1043

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

1044

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1045

int z = get_global_id(2);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1046

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1047

// Offset

1048

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

1049

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1050

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1051

// src_addr_a = address of matrix A

1052

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1053

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

1054

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

1055

1056

#if defined(MATRIX_B_DEPTH)

1057

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1058

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

1059

#else // defined(MATRIX_B_DEPTH)

1060

src1_addr_in_bytes += z * src1_stride_z;

1061

#endif // defined(MATRIX_B_DEPTH)

1062

1063

__global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);

1064

__global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1065

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1066

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1067

__global short *src_end_addr_b = src_addr_b + COLS_B;

1068

1069

src_addr_a += offset_row_a;

1070

src_addr_b += offset_row_b;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1071

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1072

// Reset accumulators

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

int8 c00 = 0.0f;

int8 c10 = 0.0f;

int8 c20 = 0.0f;

int8 c30 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1078

// This for loop performs 1 accumulation for each iteration

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1079

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1080

{

1081

/* Load values from matrix A (interleaved) and matrix B (transposed) */

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1082

short4 a0 = vload4(0, src_addr_a);

1083

short8 b0 = vload8(0, src_addr_b);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1084

1085

c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);

1086

c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);

1087

c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);

1088

c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);

1089

}

1090

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1091

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1092

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1093

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1094

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1095

short8 c00_qs16 = convert_short8_sat(c00);

1096

short8 c10_qs16 = convert_short8_sat(c10);

1097

short8 c20_qs16 = convert_short8_sat(c20);

1098

short8 c30_qs16 = convert_short8_sat(c30);

1099

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1100

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1101

c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1102

c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1103

c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1104

c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1105

#endif // defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1106

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1107

// Compute dst address

1108

__global uchar *dst_addr = offset(&dst, 0, 0);

1109

1110

// Add offset for batched GEMM

1111

dst_addr += z * dst_stride_z;

1112

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1113

// Store 8x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1114

vstore8(c00_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));

1115

vstore8(c10_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));

1116

vstore8(c20_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));

1117

vstore8(c30_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1118

}

1119

#endif // defined(FIXED_POINT_POSITION)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1120

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1121

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1122

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

1123

#if defined(DATA_TYPE)

1124

// Vector of NUM_ELEMS_PROCESSED_PER_THREAD_X elements of DATA_TYPE
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

1125

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1126

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1127

* @note This OpenCL kernel works with floating point data types (F16/F32)

1128

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1129

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1130

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1131

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1132

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1133

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1134

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1135

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1136

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1137

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1138

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1139

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1140

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1141

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1142

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1143

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1144

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1145

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1146

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1147

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1148

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1149

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1150

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1151

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1152

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1153

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

1154

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1155

IMAGE_DECLARATION(dst),

1156

uint src0_stride_z,

1157

uint src1_stride_z,

1158

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1159

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1160

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1161

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1162

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1163

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1164

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1165

// Update address for the matrix A

1166

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1167

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1168

// Update address for the matrix B

1169

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1170

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1171

// Add offset for batched GEMM

1172

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1173

1174

#if defined(MATRIX_B_DEPTH)

1175

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1176

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1177

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1178

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1179

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1180

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1181

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

1182

1183

VECTOR_TYPE acc0 = 0.0f;

1184

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1185

VECTOR_TYPE acc1 = 0.0f;

1186

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1187

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1188

VECTOR_TYPE acc2 = 0.0f;

1189

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1190

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1191

VECTOR_TYPE acc3 = 0.0f;

1192

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1193

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1194

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1195

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1196

// Load values from matrix A

1197

VEC_DATA_TYPE(DATA_TYPE, 2)

1198

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1199

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1200

VEC_DATA_TYPE(DATA_TYPE, 2)

1201

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1202

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1203

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1204

VEC_DATA_TYPE(DATA_TYPE, 2)

1205

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1206

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1207

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1208

VEC_DATA_TYPE(DATA_TYPE, 2)

1209

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1210

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1211

// Load values from matrix B

1212

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

1213

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1214

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1215

// Accumulate

1216

acc0 += b0 * (VECTOR_TYPE)a0.s0;

1217

acc0 += b1 * (VECTOR_TYPE)a0.s1;

1218

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1219

acc1 += b0 * (VECTOR_TYPE)a1.s0;

1220

acc1 += b1 * (VECTOR_TYPE)a1.s1;

1221

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1222

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1223

acc2 += b0 * (VECTOR_TYPE)a2.s0;

1224

acc2 += b1 * (VECTOR_TYPE)a2.s1;

1225

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1226

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1227

acc3 += b0 * (VECTOR_TYPE)a3.s0;

1228

acc3 += b1 * (VECTOR_TYPE)a3.s1;

1229

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1230

}

1231

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1232

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1233

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1234

// Load values from matrix A

1235

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1236

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1237

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1238

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1239

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1240

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1241

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1242

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1243

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1244

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1245

// Load values from matrix B

1246

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1247

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1248

// Accumulate

1249

acc0 += b0 * (VECTOR_TYPE)a0;

1250

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1251

acc1 += b0 * (VECTOR_TYPE)a1;

1252

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1253

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1254

acc2 += b0 * (VECTOR_TYPE)a2;

1255

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1256

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1257

acc3 += b0 * (VECTOR_TYPE)a3;

1258

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1259

}

1260

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1261

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1262

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1263

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1264

// Compute dst address

1265

__global uchar *dst_addr = offset(&dst, 0, 0);

1266

1267

// Add offset for batched GEMM

1268

dst_addr += get_global_id(2) * dst_stride_z;

1269

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1270

// Multiply by the weight of matrix-matrix product and store the result

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1271

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1272

acc0 = acc0 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1273

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1274

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1275

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1276

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1277

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1278

acc1 = acc1 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1279

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1280

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1281

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1282

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1283

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1284

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1285

acc2 = acc2 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1286

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1287

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1288

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1289

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1290

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1291

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1292

acc3 = acc3 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1293

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1294

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1295

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1296

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1297

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1298

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1299

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1300

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1301

*

1302

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1303

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1304

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1305

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1306

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1307

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1308

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1309

*

1310

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1311

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1312

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1313

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1314

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1315

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1316

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1317

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1318

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1319

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1320

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1321

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1322

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1323

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1324

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1325

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1326

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1327

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1328

*/

1329

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

1330

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1331

IMAGE_DECLARATION(dst),

1332

uint src0_stride_z,

1333

uint src1_stride_z,

1334

uint dst_stride_z)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1335

{

1336

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1337

1338

// Compute starting address for matrix A and matrix B

1339

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1340

1341

// Update address for matrix A

1342

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1343

1344

// Update address for matrix B

1345

src_addr.s1 += idx * sizeof(float);

1346

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1347

// Add offset for batched GEMM

1348

src_addr.s0 += get_global_id(2) * src0_stride_z;

1349

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1350

#if defined(MATRIX_B_DEPTH)

1351

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1352

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1353

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1354

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1355

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1356

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1357

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

float acc02 = 0.0f;

float acc03 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

float acc10 = 0.0f;

float acc11 = 0.0f;

float acc12 = 0.0f;

float acc13 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1369

1370

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

float acc20 = 0.0f;

float acc21 = 0.0f;

float acc22 = 0.0f;

float acc23 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1376

1377

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

float acc30 = 0.0f;

float acc31 = 0.0f;

float acc32 = 0.0f;

float acc33 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1383

1384

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1385

int i = 0;

1386

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1387

{

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1388

// Load values from matrix A and matrix B

1389

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1390

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1391

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1392

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1393

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1394

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1395

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1396

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1397

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1398

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1399

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1400

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1401

1402

// Multiply and accumulate

1403

acc00 = fma(a0.s0, b0.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1404

acc01 = fma(a0.s0, b0.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1405

acc02 = fma(a0.s0, b0.s2, acc02);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1406

acc03 = fma(a0.s0, b0.s3, acc03);

1407

1408

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1409

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1410

acc10 = fma(a1.s0, b0.s0, acc10);

1411

acc11 = fma(a1.s0, b0.s1, acc11);

1412

acc12 = fma(a1.s0, b0.s2, acc12);

1413

acc13 = fma(a1.s0, b0.s3, acc13);

1414

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1415

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1416

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1417

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1418

acc20 = fma(a2.s0, b0.s0, acc20);

1419

acc21 = fma(a2.s0, b0.s1, acc21);

1420

acc22 = fma(a2.s0, b0.s2, acc22);

1421

acc23 = fma(a2.s0, b0.s3, acc23);

1422

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1423

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1424

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1425

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1426

acc30 = fma(a3.s0, b0.s0, acc30);

1427

acc31 = fma(a3.s0, b0.s1, acc31);

1428

acc32 = fma(a3.s0, b0.s2, acc32);

1429

acc33 = fma(a3.s0, b0.s3, acc33);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1430

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1431

1432

// Load values from matrix A and matrix B

1433

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1434

src_addr.s1 += src1_stride_y;

1435

1436

// Multiply and accumulate

1437

acc00 = fma(a0.s1, b0.s0, acc00);

1438

acc01 = fma(a0.s1, b0.s1, acc01);

1439

acc02 = fma(a0.s1, b0.s2, acc02);

1440

acc03 = fma(a0.s1, b0.s3, acc03);

1441

1442

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1443

1444

acc10 = fma(a1.s1, b0.s0, acc10);

1445

acc11 = fma(a1.s1, b0.s1, acc11);

1446

acc12 = fma(a1.s1, b0.s2, acc12);

1447

acc13 = fma(a1.s1, b0.s3, acc13);

1448

1449

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1450

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1451

1452

acc20 = fma(a2.s1, b0.s0, acc20);

1453

acc21 = fma(a2.s1, b0.s1, acc21);

1454

acc22 = fma(a2.s1, b0.s2, acc22);

1455

acc23 = fma(a2.s1, b0.s3, acc23);

1456

1457

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1458

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1459

1460

acc30 = fma(a3.s1, b0.s0, acc30);

1461

acc31 = fma(a3.s1, b0.s1, acc31);

1462

acc32 = fma(a3.s1, b0.s2, acc32);

1463

acc33 = fma(a3.s1, b0.s3, acc33);

1464

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1465

1466

// Load values from matrix A and matrix B

1467

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1468

src_addr.s1 += src1_stride_y;

1469

1470

// Multiply and accumulate

1471

acc00 = fma(a0.s2, b0.s0, acc00);

1472

acc01 = fma(a0.s2, b0.s1, acc01);

1473

acc02 = fma(a0.s2, b0.s2, acc02);

1474

acc03 = fma(a0.s2, b0.s3, acc03);

1475

1476

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1477

1478

acc10 = fma(a1.s2, b0.s0, acc10);

1479

acc11 = fma(a1.s2, b0.s1, acc11);

1480

acc12 = fma(a1.s2, b0.s2, acc12);

1481

acc13 = fma(a1.s2, b0.s3, acc13);

1482

1483

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1484

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1485

1486

acc20 = fma(a2.s2, b0.s0, acc20);

1487

acc21 = fma(a2.s2, b0.s1, acc21);

1488

acc22 = fma(a2.s2, b0.s2, acc22);

1489

acc23 = fma(a2.s2, b0.s3, acc23);

1490

1491

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1492

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1493

1494

acc30 = fma(a3.s2, b0.s0, acc30);

1495

acc31 = fma(a3.s2, b0.s1, acc31);

1496

acc32 = fma(a3.s2, b0.s2, acc32);

1497

acc33 = fma(a3.s2, b0.s3, acc33);

1498

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1499

1500

// Load values from matrix A and matrix B

1501

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1502

src_addr.s1 += src1_stride_y;

1503

1504

// Multiply and accumulate

1505

acc00 = fma(a0.s3, b0.s0, acc00);

1506

acc01 = fma(a0.s3, b0.s1, acc01);

1507

acc02 = fma(a0.s3, b0.s2, acc02);

1508

acc03 = fma(a0.s3, b0.s3, acc03);

1509

1510

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1511

1512

acc10 = fma(a1.s3, b0.s0, acc10);

1513

acc11 = fma(a1.s3, b0.s1, acc11);

1514

acc12 = fma(a1.s3, b0.s2, acc12);

1515

acc13 = fma(a1.s3, b0.s3, acc13);

1516

1517

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1518

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1519

1520

acc20 = fma(a2.s3, b0.s0, acc20);

1521

acc21 = fma(a2.s3, b0.s1, acc21);

1522

acc22 = fma(a2.s3, b0.s2, acc22);

1523

acc23 = fma(a2.s3, b0.s3, acc23);

1524

1525

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1526

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1527

1528

acc30 = fma(a3.s3, b0.s0, acc30);

1529

acc31 = fma(a3.s3, b0.s1, acc31);

1530

acc32 = fma(a3.s3, b0.s2, acc32);

1531

acc33 = fma(a3.s3, b0.s3, acc33);

1532

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1533

1534

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1535

}

1536

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1537

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1538

{

1539

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1540

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1541

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1542

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1543

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1544

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1545

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1546

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1547

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1548

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1549

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1550

// Load values from matrix B

1551

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1552

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1553

1554

// Multiply and accumulate

1555

acc00 = fma(a0, b0.s0, acc00);

1556

acc01 = fma(a0, b0.s1, acc01);

1557

acc02 = fma(a0, b0.s2, acc02);

1558

acc03 = fma(a0, b0.s3, acc03);

1559

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1560

acc10 = fma(a1, b0.s0, acc10);

1561

acc11 = fma(a1, b0.s1, acc11);

1562

acc12 = fma(a1, b0.s2, acc12);

1563

acc13 = fma(a1, b0.s3, acc13);

1564

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1565

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1566

acc20 = fma(a2, b0.s0, acc20);

1567

acc21 = fma(a2, b0.s1, acc21);

1568

acc22 = fma(a2, b0.s2, acc22);

1569

acc23 = fma(a2, b0.s3, acc23);

1570

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1571

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1572

acc30 = fma(a3, b0.s0, acc30);

1573

acc31 = fma(a3, b0.s1, acc31);

1574

acc32 = fma(a3, b0.s2, acc32);

1575

acc33 = fma(a3, b0.s3, acc33);

1576

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1577

1578

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1579

}

1580

1581

// Compute destination address

1582

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1583

1584

// Multiply by the weight of matrix-matrix product and store the result

1585

#if defined(ALPHA)

1586

acc00 = acc00 * ALPHA;

1587

acc01 = acc01 * ALPHA;

1588

acc02 = acc02 * ALPHA;

1589

acc03 = acc03 * ALPHA;

1590

#endif // defined(ALPHA)

1591

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1592

// Compute dst address

1593

__global uchar *dst_addr = offset(&dst, 0, 0);

1594

1595

// Add offset for batched GEMM

1596

dst_addr += get_global_id(2) * dst_stride_z;

1597

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1598

float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1599

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1600

1601

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1602

#if defined(ALPHA)

1603

acc10 = acc10 * ALPHA;

1604

acc11 = acc11 * ALPHA;

1605

acc12 = acc12 * ALPHA;

1606

acc13 = acc13 * ALPHA;

1607

#endif // defined(ALPHA)

1608

float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1609

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1610

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1611

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1612

#if defined(ALPHA)

1613

acc20 = acc20 * ALPHA;

1614

acc21 = acc21 * ALPHA;

1615

acc22 = acc22 * ALPHA;

1616

acc23 = acc23 * ALPHA;

1617

#endif // defined(ALPHA)

1618

float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1619

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1620

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1621

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1622

#if defined(ALPHA)

1623

acc30 = acc30 * ALPHA;

1624

acc31 = acc31 * ALPHA;

1625

acc32 = acc32 * ALPHA;

1626

acc33 = acc33 * ALPHA;

1627

#endif // defined(ALPHA)

1628

float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1629

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1630

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1631

}

1632

1633

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1634

*

1635

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1636

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.

1637

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1638

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

1639

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1640

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1641

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1642

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1643

*

1644

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1645

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1646

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1647

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1648

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1649

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1650

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1651

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1652

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1653

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1654

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1655

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1656

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1657

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1658

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1659

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1660

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1661

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1662

*/

1663

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

1664

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1665

IMAGE_DECLARATION(dst),

1666

uint src0_stride_z,

1667

uint src1_stride_z,

1668

uint dst_stride_z)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1669

{

1670

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1671

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1672

1673

// Compute starting address for matrix A and Matrix B

1674

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1675

1676

// Update address for the matrix A

1677

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1678

1679

// Update address for the matrix B

1680

src_addr.s1 += idx * sizeof(float);

1681

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1682

// Add offset for batched GEMM

1683

src_addr.s0 += get_global_id(2) * src0_stride_z;

1684

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1685

#if defined(MATRIX_B_DEPTH)

1686

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1687

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1688

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1689

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1690

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1691

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1692

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1697

float acc10 = 0.0f;

1698

float acc11 = 0.0f;

1699

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1700

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1701

float acc20 = 0.0f;

1702

float acc21 = 0.0f;

1703

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1704

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1705

float acc30 = 0.0f;

1706

float acc31 = 0.0f;

1707

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1708

1709

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1710

int i = 0;

1711

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1712

{

1713

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1714

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1715

1716

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1717

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1718

src_addr.s1 += src1_stride_y;

1719

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1720

src_addr.s1 += src1_stride_y;

1721

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1722

src_addr.s1 += src1_stride_y;

1723

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1724

src_addr.s1 += src1_stride_y;

1725

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1726

src_addr.s1 += src1_stride_y;

1727

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1728

src_addr.s1 += src1_stride_y;

1729

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1730

src_addr.s1 += src1_stride_y;

1731

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1732

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1733

1734

// Multiply and accumulate

1735

acc00 = fma(a0.s0, b0.s0, acc00);

1736

acc00 = fma(a0.s1, b1.s0, acc00);

1737

acc00 = fma(a0.s2, b2.s0, acc00);

1738

acc00 = fma(a0.s3, b3.s0, acc00);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1739

acc00 = fma(a0.s4, b4.s0, acc00);

1740

acc00 = fma(a0.s5, b5.s0, acc00);

1741

acc00 = fma(a0.s6, b6.s0, acc00);

1742

acc00 = fma(a0.s7, b7.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1743

1744

acc01 = fma(a0.s0, b0.s1, acc01);

1745

acc01 = fma(a0.s1, b1.s1, acc01);

1746

acc01 = fma(a0.s2, b2.s1, acc01);

1747

acc01 = fma(a0.s3, b3.s1, acc01);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1748

acc01 = fma(a0.s4, b4.s1, acc01);

1749

acc01 = fma(a0.s5, b5.s1, acc01);

1750

acc01 = fma(a0.s6, b6.s1, acc01);

1751

acc01 = fma(a0.s7, b7.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1752

1753

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1754

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1755

acc10 = fma(a0.s0, b0.s0, acc10);

1756

acc10 = fma(a0.s1, b1.s0, acc10);

1757

acc10 = fma(a0.s2, b2.s0, acc10);

1758

acc10 = fma(a0.s3, b3.s0, acc10);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1759

acc10 = fma(a0.s4, b4.s0, acc10);

1760

acc10 = fma(a0.s5, b5.s0, acc10);

1761

acc10 = fma(a0.s6, b6.s0, acc10);

1762

acc10 = fma(a0.s7, b7.s0, acc10);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1763

1764

acc11 = fma(a0.s0, b0.s1, acc11);

1765

acc11 = fma(a0.s1, b1.s1, acc11);

1766

acc11 = fma(a0.s2, b2.s1, acc11);

1767

acc11 = fma(a0.s3, b3.s1, acc11);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1768

acc11 = fma(a0.s4, b4.s1, acc11);

1769

acc11 = fma(a0.s5, b5.s1, acc11);

1770

acc11 = fma(a0.s6, b6.s1, acc11);

1771

acc11 = fma(a0.s7, b7.s1, acc11);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1772

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1774

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1775

acc20 = fma(a0.s0, b0.s0, acc20);

1776

acc20 = fma(a0.s1, b1.s0, acc20);

1777

acc20 = fma(a0.s2, b2.s0, acc20);

1778

acc20 = fma(a0.s3, b3.s0, acc20);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1779

acc20 = fma(a0.s4, b4.s0, acc20);

1780

acc20 = fma(a0.s5, b5.s0, acc20);

1781

acc20 = fma(a0.s6, b6.s0, acc20);

1782

acc20 = fma(a0.s7, b7.s0, acc20);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1783

1784

acc21 = fma(a0.s0, b0.s1, acc21);

1785

acc21 = fma(a0.s1, b1.s1, acc21);

1786

acc21 = fma(a0.s2, b2.s1, acc21);

1787

acc21 = fma(a0.s3, b3.s1, acc21);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1788

acc21 = fma(a0.s4, b4.s1, acc21);

1789

acc21 = fma(a0.s5, b5.s1, acc21);

1790

acc21 = fma(a0.s6, b6.s1, acc21);

1791

acc21 = fma(a0.s7, b7.s1, acc21);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1792

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1793

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1794

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1795

acc30 = fma(a0.s0, b0.s0, acc30);

1796

acc30 = fma(a0.s1, b1.s0, acc30);

1797

acc30 = fma(a0.s2, b2.s0, acc30);

1798

acc30 = fma(a0.s3, b3.s0, acc30);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1799

acc30 = fma(a0.s4, b4.s0, acc30);

1800

acc30 = fma(a0.s5, b5.s0, acc30);

1801

acc30 = fma(a0.s6, b6.s0, acc30);

1802

acc30 = fma(a0.s7, b7.s0, acc30);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1803

1804

acc31 = fma(a0.s0, b0.s1, acc31);

1805

acc31 = fma(a0.s1, b1.s1, acc31);

1806

acc31 = fma(a0.s2, b2.s1, acc31);

1807

acc31 = fma(a0.s3, b3.s1, acc31);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1808

acc31 = fma(a0.s4, b4.s1, acc31);

1809

acc31 = fma(a0.s5, b5.s1, acc31);

1810

acc31 = fma(a0.s6, b6.s1, acc31);

1811

acc31 = fma(a0.s7, b7.s1, acc31);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1812

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1813

1814

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1815

}

1816

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1817

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1818

{

1819

// Load values from matrix A

1820

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1821

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1822

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1823

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1824

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1825

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1826

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1827

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1828

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1829

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1830

// Load values from matrix B

1831

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1832

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1833

1834

// Multiply and accumulate

1835

acc00 = fma(a0, b0.s0, acc00);

1836

acc01 = fma(a0, b0.s1, acc01);

1837

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1838

acc10 = fma(a1, b0.s0, acc10);

1839

acc11 = fma(a1, b0.s1, acc11);

1840

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1841

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1842

acc20 = fma(a2, b0.s0, acc20);

1843

acc21 = fma(a2, b0.s1, acc21);

1844

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1845

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1846

acc30 = fma(a3, b0.s0, acc30);

1847

acc31 = fma(a3, b0.s1, acc31);

1848

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1849

1850

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1851

}

1852

1853

// Compute destination address

1854

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1855

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1856

// Compute dst address

1857

__global uchar *dst_addr = offset(&dst, 0, 0);

1858

1859

// Add offset for batched GEMM

1860

dst_addr += get_global_id(2) * dst_stride_z;

1861

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1862

// Multiply by the weight of matrix-matrix product and store the result

1863

#if defined(ALPHA)

1864

acc00 = acc00 * ALPHA;

1865

acc01 = acc01 * ALPHA;

1866

#endif // defined(ALPHA)

1867

float2 acc0 = ((float2)(acc00, acc01));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1868

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1869

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1870

#if defined(ALPHA)

1871

acc10 = acc10 * ALPHA;

1872

acc11 = acc11 * ALPHA;

1873

#endif // defined(ALPHA)

1874

float2 acc1 = ((float2)(acc10, acc11));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1875

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1876

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1877

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1878

#if defined(ALPHA)

1879

acc20 = acc20 * ALPHA;

1880

acc21 = acc21 * ALPHA;

1881

#endif // defined(ALPHA)

1882

float2 acc2 = ((float2)(acc20, acc21));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1883

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1884

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1885

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1886

#if defined(ALPHA)

1887

acc30 = acc30 * ALPHA;

1888

acc31 = acc31 * ALPHA;

1889

#endif // defined(ALPHA)

1890

float2 acc3 = (float2)(acc30, acc31);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1891

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1892

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1893

}

1894

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

1895

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1896

*

1897

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

1898

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1899

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1900

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1901

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

1902

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1903

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

1904

*

1905

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

1906

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1907

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

1908

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1909

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

1910

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1911

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1912

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1913

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

1914

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1915

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

1916

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1917

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1918

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1919

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1920

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1921

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1922

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1923

*/

1924

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

1925

IMAGE_DECLARATION(src1),

1926

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z)

{

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1932

1933

// Compute starting address for matrix A and Matrix B

1934

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1935

1936

// Update address for the matrix A

1937

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1938

1939

// Update address for the matrix B

1940

src_addr.s1 += idx * sizeof(half);

1941

1942

// Add offset for batched GEMM

1943

src_addr.s0 += get_global_id(2) * src0_stride_z;

1944

1945

#if defined(MATRIX_B_DEPTH)

1946

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1947

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1948

#else // defined(MATRIX_B_DEPTH)

1949

src_addr.s1 += get_global_id(2) * src1_stride_z;

1950

#endif // defined(MATRIX_B_DEPTH)

1951

1952

half8 acc0 = 0.0h;

1953

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1954

half8 acc1 = 0.0h;

1955

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1956

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1957

half8 acc2 = 0.0h;

1958

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1959

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1960

half8 acc3 = 0.0h;

1961

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1962

1963

int i = 0;

1964

for(; i <= ((int)COLS_A - 4); i += 4)

1965

{

1966

// Load values from matrix A

1967

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1968

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1969

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1970

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1971

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1972

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1973

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1974

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1975

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1976

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1977

// Load values from matrix B

1978

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

1979

src_addr.s1 += src1_stride_y;

1980

1981

// Accumulate

1982

acc0 = fma(b0, (half8)a0.s0, acc0);

1983

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1984

acc1 = fma(b0, (half8)a1.s0, acc1);

1985

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1986

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1987

acc2 = fma(b0, (half8)a2.s0, acc2);

1988

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1989

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1990

acc3 = fma(b0, (half8)a3.s0, acc3);

1991

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1992

1993

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

1994

src_addr.s1 += src1_stride_y;

1995

acc0 = fma(b0, (half8)a0.s1, acc0);

1996

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1997

acc1 = fma(b0, (half8)a1.s1, acc1);

1998

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1999

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2000

acc2 = fma(b0, (half8)a2.s1, acc2);

2001

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2002

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2003

acc3 = fma(b0, (half8)a3.s1, acc3);

2004

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2005

2006

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2007

src_addr.s1 += src1_stride_y;

2008

acc0 = fma(b0, (half8)a0.s2, acc0);

2009

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2010

acc1 = fma(b0, (half8)a1.s2, acc1);

2011

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2012

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2013

acc2 = fma(b0, (half8)a2.s2, acc2);

2014

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2015

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2016

acc3 = fma(b0, (half8)a3.s2, acc3);

2017

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2018

2019

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2020

src_addr.s1 += src1_stride_y;

2021

acc0 = fma(b0, (half8)a0.s3, acc0);

2022

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2023

acc1 = fma(b0, (half8)a1.s3, acc1);

2024

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2025

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2026

acc2 = fma(b0, (half8)a2.s3, acc2);

2027

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2028

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2029

acc3 = fma(b0, (half8)a3.s3, acc3);

2030

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2031

2032

src_addr.s0 += 4 * sizeof(half);

2033

}

2034

2035

for(; i < (int)COLS_A; ++i)

2036

{

2037

// Load values from matrix A

2038

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2039

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2040

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2041

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2042

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2043

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2044

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2045

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2046

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2047

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2048

// Load values from matrix B

2049

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2050

2051

src_addr += (int2)(sizeof(half), src1_stride_y);

2052

2053

// Accumulate

2054

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

2055

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2056

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

2057

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2058

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2059

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

2060

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2061

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2062

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

2063

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2064

}

2065

2066

// Compute destination address

2067

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2068

2069

// Compute dst address

2070

__global uchar *dst_addr = offset(&dst, 0, 0);

2071

2072

// Add offset for batched GEMM

2073

dst_addr += get_global_id(2) * dst_stride_z;

2074

2075

// Multiply by the weight of matrix-matrix product and store the result

2076

#if defined(ALPHA)

2077

acc0 = acc0 * (half8)ALPHA;

2078

#endif // defined(ALPHA)

2079

vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

2080

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2081

#if defined(ALPHA)

2082

acc1 = acc1 * (half8)ALPHA;

2083

#endif // defined(ALPHA)

2084

vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

2085

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2086

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2087

#if defined(ALPHA)

2088

acc2 = acc2 * (half8)ALPHA;

2089

#endif // defined(ALPHA)

2090

vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

2091

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2092

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2093

#if defined(ALPHA)

2094

acc3 = acc3 * (half8)ALPHA;

2095

#endif // defined(ALPHA)

2096

vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

2097

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2098

}

2099

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2100

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2101

/** Computes the matrix by matrix multiplication between the matrix A (src0) and the matrix B (src1) when neither matrix has been reshaped
 *
 * Every work-item produces one tile of the destination: 16 QS8 values along X (one vstore16 per row)
 * for NUM_ELEMS_PROCESSED_PER_THREAD_Y consecutive rows (the code below handles up to 4 rows).
 *
 * @note This OpenCL kernel works with the fixed point data type QS8
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A
 * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
 * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the matrix A in Z dimension (in bytes), applied once per get_global_id(2)
 * @param[in]  src1_stride_z                      Stride of the matrix B in Z dimension (in bytes), applied once per get_global_id(2) (modulo MATRIX_B_DEPTH when defined)
 * @param[in]  dst_stride_z                       Stride of the destination in Z dimension (in bytes), applied once per get_global_id(2)
 */
__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst),
                          uint src0_stride_z,
                          uint src1_stride_z,
                          uint dst_stride_z)
{
    // First column of matrix B handled by this work-item
    int first_col = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Byte offset of the current element inside matrix A:
    // first element + first row of this work-item + batch (Z) offset
    int a_off = src0_offset_first_element_in_bytes;
    a_off += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
    a_off += get_global_id(2) * src0_stride_z;

    // Byte offset of the current element inside matrix B:
    // first element + first column of this work-item + batch (Z) offset
    int b_off = src1_offset_first_element_in_bytes;
    b_off += first_col * sizeof(char);
#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    b_off += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    b_off += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the first row of A handled by this work-item
    int a_row_end = a_off + (COLS_A * sizeof(char));

    // 16-bit accumulators: "_lo" covers output columns 0-7, "_hi" covers output columns 8-15
    short8 acc0_lo = 0;
    short8 acc0_hi = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    short8 acc1_lo = 0;
    short8 acc1_hi = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    short8 acc2_lo = 0;
    short8 acc2_hi = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    short8 acc3_lo = 0;
    short8 acc3_hi = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consumes two columns of A and two rows of B per iteration
    // (i.e. 4 multiply-accumulate calls per processed row)
    while(a_off <= (a_row_end - 2))
    {
        // Two consecutive values from each processed row of A
        char2 a0 = vload2(0, (__global char *)(src0_ptr + a_off + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        char2 a1 = vload2(0, (__global char *)(src0_ptr + a_off + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        char2 a2 = vload2(0, (__global char *)(src0_ptr + a_off + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        char2 a3 = vload2(0, (__global char *)(src0_ptr + a_off + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // 16 values from each of the two matching rows of B
        char16 b0 = vload16(0, (__global char *)(src1_ptr + b_off + 0 * src1_stride_y));
        char16 b1 = vload16(0, (__global char *)(src1_ptr + b_off + 1 * src1_stride_y));

        // Row 0 (the order of the calls on each accumulator is kept: first b0, then b1)
        acc0_lo = mlal_sat_qs8x8(acc0_lo, (char8)a0.x, b0.lo, FIXED_POINT_POSITION);
        acc0_lo = mlal_sat_qs8x8(acc0_lo, (char8)a0.y, b1.lo, FIXED_POINT_POSITION);
        acc0_hi = mlal_sat_qs8x8(acc0_hi, (char8)a0.x, b0.hi, FIXED_POINT_POSITION);
        acc0_hi = mlal_sat_qs8x8(acc0_hi, (char8)a0.y, b1.hi, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        // Row 1
        acc1_lo = mlal_sat_qs8x8(acc1_lo, (char8)a1.x, b0.lo, FIXED_POINT_POSITION);
        acc1_lo = mlal_sat_qs8x8(acc1_lo, (char8)a1.y, b1.lo, FIXED_POINT_POSITION);
        acc1_hi = mlal_sat_qs8x8(acc1_hi, (char8)a1.x, b0.hi, FIXED_POINT_POSITION);
        acc1_hi = mlal_sat_qs8x8(acc1_hi, (char8)a1.y, b1.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        // Row 2
        acc2_lo = mlal_sat_qs8x8(acc2_lo, (char8)a2.x, b0.lo, FIXED_POINT_POSITION);
        acc2_lo = mlal_sat_qs8x8(acc2_lo, (char8)a2.y, b1.lo, FIXED_POINT_POSITION);
        acc2_hi = mlal_sat_qs8x8(acc2_hi, (char8)a2.x, b0.hi, FIXED_POINT_POSITION);
        acc2_hi = mlal_sat_qs8x8(acc2_hi, (char8)a2.y, b1.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        // Row 3
        acc3_lo = mlal_sat_qs8x8(acc3_lo, (char8)a3.x, b0.lo, FIXED_POINT_POSITION);
        acc3_lo = mlal_sat_qs8x8(acc3_lo, (char8)a3.y, b1.lo, FIXED_POINT_POSITION);
        acc3_hi = mlal_sat_qs8x8(acc3_hi, (char8)a3.x, b0.hi, FIXED_POINT_POSITION);
        acc3_hi = mlal_sat_qs8x8(acc3_hi, (char8)a3.y, b1.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance two columns in A and two rows in B
        a_off += 2;
        b_off += 2 * src1_stride_y;
    }

    // Left-over accumulations: one column of A and one row of B per iteration
    while(a_off < a_row_end)
    {
        char a0 = *((__global char *)(src0_ptr + a_off + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        char a1 = *((__global char *)(src0_ptr + a_off + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        char a2 = *((__global char *)(src0_ptr + a_off + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        char a3 = *((__global char *)(src0_ptr + a_off + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        char16 b0 = vload16(0, (__global char *)(src1_ptr + b_off));

        acc0_lo = mlal_sat_qs8x8(acc0_lo, (char8)a0, b0.lo, FIXED_POINT_POSITION);
        acc0_hi = mlal_sat_qs8x8(acc0_hi, (char8)a0, b0.hi, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1_lo = mlal_sat_qs8x8(acc1_lo, (char8)a1, b0.lo, FIXED_POINT_POSITION);
        acc1_hi = mlal_sat_qs8x8(acc1_hi, (char8)a1, b0.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2_lo = mlal_sat_qs8x8(acc2_lo, (char8)a2, b0.lo, FIXED_POINT_POSITION);
        acc2_hi = mlal_sat_qs8x8(acc2_hi, (char8)a2, b0.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3_lo = mlal_sat_qs8x8(acc3_lo, (char8)a3, b0.lo, FIXED_POINT_POSITION);
        acc3_hi = mlal_sat_qs8x8(acc3_hi, (char8)a3, b0.hi, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance one column in A and one row in B
        a_off += 1;
        b_off += src1_stride_y;
    }

    // Compute destination address
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    __global uchar *dst_addr = offset(&dst_img, 0, 0);

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

    // Narrow each row to QS8 with saturation, multiply by the weight of the matrix product (if any) and store it
    char16 out_qs8 = convert_char16_sat((short16)(acc0_lo, acc0_hi));
#if defined(ALPHA)
    out_qs8 = mul_sat_qs8x16(out_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    out_qs8 = convert_char16_sat((short16)(acc1_lo, acc1_hi));
#if defined(ALPHA)
    out_qs8 = mul_sat_qs8x16(out_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    out_qs8 = convert_char16_sat((short16)(acc2_lo, acc2_hi));
#if defined(ALPHA)
    out_qs8 = mul_sat_qs8x16(out_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    out_qs8 = convert_char16_sat((short16)(acc3_lo, acc3_hi));
#if defined(ALPHA)
    out_qs8 = mul_sat_qs8x16(out_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2284

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2285

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2286

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2287

* @note This OpenCL kernel works with fixed point data types QS16

2288

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2289

* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2290

* @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2291

* @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2292

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

2293

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2294

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2295

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2296

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2297

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

2298

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2299

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

2300

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

2301

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

2302

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2303

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

2304

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2305

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

2306

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

2307

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

2308

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2309

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2310

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2311

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2312

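* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

* @param[in] src0_stride_z Stride of the matrix A in Z dimension (in bytes)

* @param[in] src1_stride_z Stride of the matrix B in Z dimension (in bytes)

* @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes)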

2313

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2314

__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2315

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2316

IMAGE_DECLARATION(dst),

2317

uint src0_stride_z,

2318

uint src1_stride_z,

2319

uint dst_stride_z)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2320

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2321

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2322

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2323

// Compute starting address for matrix A and Matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2324

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2325

2326

// Update address for the matrix A

2327

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

2328

2329

// Update address for the matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2330

src_addr.s1 += idx * sizeof(short);

2331

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2332

// Add offset for batched GEMM

2333

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2334

2335

#if defined(MATRIX_B_DEPTH)

2336

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2337

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

2338

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2339

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2340

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2341

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2342

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2343

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2344

int8 acc0 = 0;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2345

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2346

int8 acc1 = 0;

2347

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2348

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2349

int8 acc2 = 0;

2350

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2351

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2352

int8 acc3 = 0;

2353

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2354

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2355

// This for loop performs 4 accumulations per iteration

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

2356

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2357

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2358

short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2359

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2360

short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2361

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2362

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2363

short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2364

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2365

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2366

short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2367

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2368

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

2369

short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2370

2371

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);

2372

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2373

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2374

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);

2375

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);

2376

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2377

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2378

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);

2379

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);

2380

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2381

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2382

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);

2383

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);

2384

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2385

}

2386

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2387

// Left-over accumulations

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2388

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))

2389

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2390

short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2391

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2392

short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2393

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2394

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2395

short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2396

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2397

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2398

short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2399

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2400

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));

2401

2402

acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2403

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2404

acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);

2405

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2406

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2407

acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);

2408

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2409

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2410

acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);

2411

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2412

}

2413

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2414

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2415

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2416

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2417

// Compute dst address

2418

__global uchar *dst_addr = offset(&dst, 0, 0);

2419

Gian Marco Iodice

81b28c4

2018-03-29 10:29:36 +0100

[diff] [blame]

2420

// Add offset for batched GEMM

2421

dst_addr += get_global_id(2) * dst_stride_z;

2422

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2423

// Multiply by the weight of matrix product and store the result

2424

short8 acc_qs16;

2425

acc_qs16 = convert_short8_sat(acc0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2426

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2427

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2428

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2429

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2430

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2431

acc_qs16 = convert_short8_sat(acc1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2432

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2433

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2434

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2435

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2436

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2437

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2438

acc_qs16 = convert_short8_sat(acc2);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2439

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2440

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2441

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2442

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2443

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2444

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2445

acc_qs16 = convert_short8_sat(acc3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2446

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2447

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2448

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2449

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2450

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2451

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2452

#endif // defined(FIXED_POINT_POSITION)

2453

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2454

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2455

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2456

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

2457

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2458

* @note The beta's value need to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2459

*

2460

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

2461

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

2462

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2463

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

2464

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2465

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2466

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2467

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2468

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2469

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2470

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2471

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2472

*/

2473

__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),

2474

IMAGE_DECLARATION(dst))

2475

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2476

// Compute source and destination addresses

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2477

Image src = CONVERT_TO_IMAGE_STRUCT(src);

2478

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2479

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2480

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2481

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

2482

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2483

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2484

float4 c = vload4(0, (__global float *)src.ptr);

2485

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2486

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2487

float4 out = alpha_ab + (float4)BETA * c;

2488

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2489

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2490

vstore4(out, 0, (__global float *)dst.ptr);

2491

}

2492

2493

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

2494

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2495

* @note The beta's value need to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2496

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2497

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

2498

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

2499

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2500

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

2501

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2502

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2503

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2504

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2505

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2506

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2507

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2508

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2509

*/

2510

__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),

2511

IMAGE_DECLARATION(dst))

2512

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2513

// Compute source and destination addresses

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2514

Image src = CONVERT_TO_IMAGE_STRUCT(src);

2515

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2516

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2517

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2518

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

2519

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2520

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2521

half8 c = vload8(0, (__global half *)src.ptr);

2522

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2523

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2524

half8 out = alpha_ab + (half8)BETA * c;

2525

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2526

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2527

vstore8(out, 0, (__global half *)dst.ptr);

2528

}

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2529

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2530

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2531

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

2532

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2533

* @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2534

*

2535

* @note: BETA must be passed in 8 bit fixed point format

2536

*

2537

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8

2538

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

2539

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2540

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

2541

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2542

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

2543

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

2544

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2545

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2546

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2547

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2548

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2549

*/

2550

__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),

2551

IMAGE_DECLARATION(dst))

2552

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2553

// Compute source and destination addresses

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2554

Image src = CONVERT_TO_IMAGE_STRUCT(src);

2555

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2556

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2557

// Load values from A x B

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2558

char16 alpha_ab = vload16(0, (__global char *)dst.ptr);

2559

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2560

// Load values from Matrix C

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2561

char16 c = vload16(0, (__global char *)src.ptr);

2562

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2563

// Computes alpha * axb + beta * c

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2564

char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);

2565

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2566

// Store final result in axb matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2567

vstore16(out, 0, (__global char *)dst.ptr);

2568

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2569

2570

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

2571

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2572

* @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2573

*

2574

* @note: BETA must be passed in 16 bit fixed point format

2575

*

2576

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16

2577

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

2578

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2579

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

2580

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2581

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

2582

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

2583

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2584

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2585

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2586

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2587

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2588

*/

2589

__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),

2590

IMAGE_DECLARATION(dst))

2591

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2592

// Compute source and destination addresses

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2593

Image src = CONVERT_TO_IMAGE_STRUCT(src);

2594

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2595

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2596

// Load values from A x B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2597

short8 alpha_ab = vload8(0, (__global short *)dst.ptr);

2598

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2599

// Load values from Matrix C

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2600

short8 c = vload8(0, (__global short *)src.ptr);

2601

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2602

// Computes alpha * axb + beta * c

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2603

short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);

2604

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2605

// Store final result in axb matrix

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2606

vstore8(out, 0, (__global short *)dst.ptr);

2607

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2608

#endif // defined(FIXED_POINT_POSITION)

2609

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2610

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2611

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2612

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

2613

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2614

* @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2615

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2616

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2617

*

2618

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

2619

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2620

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2621

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2622

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2623

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2624

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2625

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2626

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2627

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2628

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2629

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

2630

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2631

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2632

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2633

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2634

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2635

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2636

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2637

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2638

*/

2639

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

2640

TENSOR3D_DECLARATION(src1),

2641

IMAGE_DECLARATION(dst))

2642

{

2643

int idx = get_global_id(0) * 4;

2644

int idy = get_global_id(1);

2645

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2646

// Compute the address for the vector A and matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2647

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

2648

src_addr.s1 += idx * sizeof(float);

2649

2650

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

2654

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2655

{

2656

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

2657

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

2658

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

2659

2660

acc += b0 * (float4)a0.s0;

2661

acc += b1 * (float4)a0.s1;

2662

}

2663

2664

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

2665

{

2666

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

2667

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

2668

2669

acc += b0 * (float4)a0;

2670

}

2671

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2672

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2673

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2674

2675

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

2676

}

Anton Lokhmotov