Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-01-12 10:21:40 +0000

[diff] [blame]

46

*

47

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

48

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

49

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

50

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

51

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

52

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

53

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

54

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

55

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

56

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

57

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

58

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

59

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

60

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

61

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

62

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

63

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

64

__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
                                TENSOR3D_DECLARATION(dst))
{
    // Each work-item copies one vector of TRANSPOSE_W elements from the source
    // to its reshaped position in the destination.
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source tensor view built by the helper macro; only its ptr field is read below
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Size in bytes of one vector of TRANSPOSE_W elements
    const uint vec_bytes = TRANSPOSE_W * sizeof(DATA_TYPE);

    // The X work-item index is split by MULT_TRANSPOSE1XW_WIDTH: the quotient selects
    // the destination row, the remainder selects the vector slot inside a group.
    const uint dst_row  = gid_x / MULT_TRANSPOSE1XW_WIDTH;
    const uint dst_slot = gid_x % MULT_TRANSPOSE1XW_WIDTH;

    // Destination byte offset. The Y work-item index advances along the destination
    // row in groups of MULT_TRANSPOSE1XW_WIDTH vectors (X and Y are swapped).
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_y * vec_bytes * MULT_TRANSPOSE1XW_WIDTH;
    dst_addr_in_bytes += dst_row * dst_stride_y;
    dst_addr_in_bytes += dst_slot * vec_bytes;

    // Add offset for batched GEMM
    dst_addr_in_bytes += gid_z * dst_stride_z;

    // Load one vector from the source ...
    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
    row_vec = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src_tensor.ptr);

    // ... and store it unchanged at the computed destination address
    VSTORE(TRANSPOSE_W)
    (row_vec, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

87

#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

88

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

89

#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

90

91

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

92

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

93

* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)

94

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

95

*

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

96

* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

97

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

98

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

99

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

100

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

101

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

102

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

103

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

104

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

105

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

106

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

107

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

108

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

109

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

110

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

111

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

112

*/

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

113

__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
                                 TENSOR3D_DECLARATION(dst))
{
    // Each work-item reads a 4x4 block (4 elements from each of 4 consecutive
    // source rows) and writes it out column by column.
    const uint gid_x = get_global_id(0);
    const uint gid_y = get_global_id(1);
    const uint gid_z = get_global_id(2);

    // Source tensor view built by the helper macro; only its ptr field is read below
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Destination byte offset:
    //  - each X step skips 16 elements * MULT_INTERLEAVE4X4_HEIGHT
    //  - the Y index is split by MULT_INTERLEAVE4X4_HEIGHT: the quotient selects the
    //    destination row, the remainder selects a 4-element slot inside the group
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes;
    dst_addr_in_bytes += gid_x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT;
    dst_addr_in_bytes += (gid_y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y;
    dst_addr_in_bytes += (gid_y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);

    // Add offset for batched GEMM
    dst_addr_in_bytes += gid_z * dst_stride_z;

    __global uchar *row_base = src_tensor.ptr;

    // Load four elements from each of the four source rows
    VEC_DATA_TYPE(DATA_TYPE, 4)
    r0 = vload4(0, (__global DATA_TYPE *)(row_base + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    r1 = vload4(0, (__global DATA_TYPE *)(row_base + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    r2 = vload4(0, (__global DATA_TYPE *)(row_base + 2 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    r3 = vload4(0, (__global DATA_TYPE *)(row_base + 3 * src_stride_y));

    __global DATA_TYPE *out = (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes);

    // Store the block column by column: column k gathers element k of every row.
    // Consecutive columns are 4 * MULT_INTERLEAVE4X4_HEIGHT elements apart.
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(r0.s0, r1.s0, r2.s0, r3.s0), 0, out + 0 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(r0.s1, r1.s1, r2.s1, r3.s1), 0, out + 4 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(r0.s2, r1.s2, r2.s2, r3.s2), 0, out + 8 * MULT_INTERLEAVE4X4_HEIGHT);
    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(r0.s3, r1.s3, r2.s3, r3.s3), 0, out + 12 * MULT_INTERLEAVE4X4_HEIGHT);
}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

156

#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

157

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

158

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

159

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

160

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

161

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

162

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

163

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

164

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

165

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

166

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

167

*

168

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

169

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

170

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

171

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

172

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

173

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

174

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

175

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

176

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

177

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

178

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

179

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

180

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

181

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

182

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

183

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

184

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

185

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

186

*/

Gian Marco Iodice

bb36a8e

2018-04-19 12:05:08 +0100

[diff] [blame]

187

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

188

IMAGE_DECLARATION(src1),

189

IMAGE_DECLARATION(dst),

190

uint src0_stride_z,

191

uint src1_stride_z,

192

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

193

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

194

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

195

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

196

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

197

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

198

// Offset

199

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

200

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

201

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

202

// src_addr_a = address of matrix A

203

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

204

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

205

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

206

207

#if defined(MATRIX_B_DEPTH)

208

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

209

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

210

#else // defined(MATRIX_B_DEPTH)

211

src1_addr_in_bytes += z * src1_stride_z;

212

#endif // defined(MATRIX_B_DEPTH)

213

214

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

215

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

216

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

217

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

218

__global float *src_end_addr_b = src_addr_b + COLS_B;

219

220

src_addr_a += offset_row_a;

221

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

222

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

223

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

float4 c00 = 0.0f;

float4 c10 = 0.0f;

float4 c20 = 0.0f;

float4 c30 = 0.0f;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

229

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

230

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

231

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

232

float4 a0 = vload4(0, src_addr_a);

233

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

234

235

c00 += (float4)a0.s0 * b0;

236

c10 += (float4)a0.s1 * b0;

237

c20 += (float4)a0.s2 * b0;

238

c30 += (float4)a0.s3 * b0;

239

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

240

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

241

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

242

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

243

244

c00 += (float4)a0.s0 * b0;

245

c10 += (float4)a0.s1 * b0;

246

c20 += (float4)a0.s2 * b0;

247

c30 += (float4)a0.s3 * b0;

248

}

249

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

250

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

251

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

252

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

253

float4 a0 = vload4(0, src_addr_a);

254

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

255

256

c00 += (float4)a0.s0 * b0;

257

c10 += (float4)a0.s1 * b0;

258

c20 += (float4)a0.s2 * b0;

259

c30 += (float4)a0.s3 * b0;

260

}

261

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

262

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

263

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

264

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

265

#if defined(ALPHA)

266

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

267

c00 = c00 * (float4)ALPHA;

268

c10 = c10 * (float4)ALPHA;

269

c20 = c20 * (float4)ALPHA;

270

c30 = c30 * (float4)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

271

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

272

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

273

// Compute dst address

274

__global uchar *dst_addr = offset(&dst, 0, 0);

275

276

// Add offset for batched GEMM

277

dst_addr += z * dst_stride_z;

278

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

279

// Store 4x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

280

vstore4(c00, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

281

vstore4(c10, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

282

vstore4(c20, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

283

vstore4(c30, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

284

}

285

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

286

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

287

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

288

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

289

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

290

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

291

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

292

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

293

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

294

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

295

*

296

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

297

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

298

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

299

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

300

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

301

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

302

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

303

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

304

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

305

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

306

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

307

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

308

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

309

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

310

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

311

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

312

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

313

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

314

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

315

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

316

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

317

IMAGE_DECLARATION(dst),

318

uint src0_stride_z,

319

uint src1_stride_z,

320

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

321

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

322

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

323

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

324

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

325

326

// Offset

327

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

328

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

329

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

330

// src_addr_a = address of matrix A

331

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

332

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

333

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

334

335

#if defined(MATRIX_B_DEPTH)

336

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

337

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

338

#else // defined(MATRIX_B_DEPTH)

339

src1_addr_in_bytes += z * src1_stride_z;

340

#endif // defined(MATRIX_B_DEPTH)

341

342

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

343

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

344

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

345

src_addr_a += offset_row_a;

346

src_addr_b += offset_row_b;

347

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

348

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

366

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

367

368

int i = 0;

369

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

370

{

371

// Load values from matrix A (interleaved) and matrix B (transposed)

372

float4 a0 = vload4(0, src_addr_a);

373

float4 b0 = vload4(0, src_addr_b);

374

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

375

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

376

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

377

378

c00 = fma(a0.s0, b0.s0, c00);

379

c01 = fma(a0.s0, b0.s1, c01);

380

c02 = fma(a0.s0, b0.s2, c02);

381

c03 = fma(a0.s0, b0.s3, c03);

382

383

c10 = fma(a0.s1, b0.s0, c10);

384

c11 = fma(a0.s1, b0.s1, c11);

385

c12 = fma(a0.s1, b0.s2, c12);

386

c13 = fma(a0.s1, b0.s3, c13);

387

388

c20 = fma(a0.s2, b0.s0, c20);

389

c21 = fma(a0.s2, b0.s1, c21);

390

c22 = fma(a0.s2, b0.s2, c22);

391

c23 = fma(a0.s2, b0.s3, c23);

392

393

c30 = fma(a0.s3, b0.s0, c30);

394

c31 = fma(a0.s3, b0.s1, c31);

395

c32 = fma(a0.s3, b0.s2, c32);

396

c33 = fma(a0.s3, b0.s3, c33);

397

398

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

399

a0 = vload4(0, src_addr_a);

400

b0 = vload4(0, src_addr_b);

401

402

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

403

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

404

405

c00 = fma(a0.s0, b0.s0, c00);

406

c01 = fma(a0.s0, b0.s1, c01);

407

c02 = fma(a0.s0, b0.s2, c02);

408

c03 = fma(a0.s0, b0.s3, c03);

409

410

c10 = fma(a0.s1, b0.s0, c10);

411

c11 = fma(a0.s1, b0.s1, c11);

412

c12 = fma(a0.s1, b0.s2, c12);

413

c13 = fma(a0.s1, b0.s3, c13);

414

415

c20 = fma(a0.s2, b0.s0, c20);

416

c21 = fma(a0.s2, b0.s1, c21);

417

c22 = fma(a0.s2, b0.s2, c22);

418

c23 = fma(a0.s2, b0.s3, c23);

419

420

c30 = fma(a0.s3, b0.s0, c30);

421

c31 = fma(a0.s3, b0.s1, c31);

422

c32 = fma(a0.s3, b0.s2, c32);

423

c33 = fma(a0.s3, b0.s3, c33);

424

425

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

426

a0 = vload4(0, src_addr_a);

427

b0 = vload4(0, src_addr_b);

428

429

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

430

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

431

432

c00 = fma(a0.s0, b0.s0, c00);

433

c01 = fma(a0.s0, b0.s1, c01);

434

c02 = fma(a0.s0, b0.s2, c02);

435

c03 = fma(a0.s0, b0.s3, c03);

436

437

c10 = fma(a0.s1, b0.s0, c10);

438

c11 = fma(a0.s1, b0.s1, c11);

439

c12 = fma(a0.s1, b0.s2, c12);

440

c13 = fma(a0.s1, b0.s3, c13);

441

442

c20 = fma(a0.s2, b0.s0, c20);

443

c21 = fma(a0.s2, b0.s1, c21);

444

c22 = fma(a0.s2, b0.s2, c22);

445

c23 = fma(a0.s2, b0.s3, c23);

446

447

c30 = fma(a0.s3, b0.s0, c30);

448

c31 = fma(a0.s3, b0.s1, c31);

449

c32 = fma(a0.s3, b0.s2, c32);

450

c33 = fma(a0.s3, b0.s3, c33);

451

452

// Load values from matrix A (interleaved) and matrix B (transposed)

453

a0 = vload4(0, src_addr_a);

454

b0 = vload4(0, src_addr_b);

455

456

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

457

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

458

459

c00 = fma(a0.s0, b0.s0, c00);

460

c01 = fma(a0.s0, b0.s1, c01);

461

c02 = fma(a0.s0, b0.s2, c02);

462

c03 = fma(a0.s0, b0.s3, c03);

463

464

c10 = fma(a0.s1, b0.s0, c10);

465

c11 = fma(a0.s1, b0.s1, c11);

466

c12 = fma(a0.s1, b0.s2, c12);

467

c13 = fma(a0.s1, b0.s3, c13);

468

469

c20 = fma(a0.s2, b0.s0, c20);

470

c21 = fma(a0.s2, b0.s1, c21);

471

c22 = fma(a0.s2, b0.s2, c22);

472

c23 = fma(a0.s2, b0.s3, c23);

473

474

c30 = fma(a0.s3, b0.s0, c30);

475

c31 = fma(a0.s3, b0.s1, c31);

476

c32 = fma(a0.s3, b0.s2, c32);

477

c33 = fma(a0.s3, b0.s3, c33);

478

}

479

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

480

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

481

{

482

// Load values from matrix A (interleaved) and matrix B (transposed)

483

float4 a0 = vload4(0, src_addr_a);

484

float4 b0 = vload4(0, src_addr_b);

485

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

486

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

487

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

488

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

489

c00 = fma(a0.s0, b0.s0, c00);

490

c01 = fma(a0.s0, b0.s1, c01);

491

c02 = fma(a0.s0, b0.s2, c02);

492

c03 = fma(a0.s0, b0.s3, c03);

493

494

c10 = fma(a0.s1, b0.s0, c10);

495

c11 = fma(a0.s1, b0.s1, c11);

496

c12 = fma(a0.s1, b0.s2, c12);

497

c13 = fma(a0.s1, b0.s3, c13);

498

499

c20 = fma(a0.s2, b0.s0, c20);

500

c21 = fma(a0.s2, b0.s1, c21);

501

c22 = fma(a0.s2, b0.s2, c22);

502

c23 = fma(a0.s2, b0.s3, c23);

503

504

c30 = fma(a0.s3, b0.s0, c30);

505

c31 = fma(a0.s3, b0.s1, c31);

506

c32 = fma(a0.s3, b0.s2, c32);

507

c33 = fma(a0.s3, b0.s3, c33);

508

}

509

510

// Compute destination address

511

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

512

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

513

#if defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

514

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

531

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

532

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

533

// Compute dst address

534

__global uchar *dst_addr = offset(&dst, 0, 0);

535

536

// Add offset for batched GEMM

537

dst_addr += z * dst_stride_z;

538

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

539

// Store 4x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

540

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

541

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

542

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

543

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

544

}

545

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

546

// Undefine local defines

547

#undef COLS_MTX_B

548

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

549

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

550

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

551

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

552

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

553

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

554

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

555

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

556

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

557

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

558

*

559

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

560

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

561

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

562

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

563

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

564

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

565

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

566

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

567

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

568

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

569

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

570

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

571

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

572

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

573

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

574

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

575

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

576

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

577

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

578

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

579

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

580

IMAGE_DECLARATION(dst),

581

uint src0_stride_z,

582

uint src1_stride_z,

583

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

584

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

585

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

586

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

587

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

588

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

589

// Offset

590

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

591

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

592

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

593

// src_addr_a = address of matrix A

594

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

595

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

596

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

597

598

#if defined(MATRIX_B_DEPTH)

599

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

600

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

601

#else // defined(MATRIX_B_DEPTH)

602

src1_addr_in_bytes += z * src1_stride_z;

603

#endif // defined(MATRIX_B_DEPTH)

604

605

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

606

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

607

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

608

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

609

__global half *src_end_addr_b = src_addr_b + COLS_B;

610

611

src_addr_a += offset_row_a;

612

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

613

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

614

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

620

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

621

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

622

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

623

half4 a0 = vload4(0, src_addr_a);

624

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

625

626

c00 += (half8)a0.s0 * b0;

627

c10 += (half8)a0.s1 * b0;

628

c20 += (half8)a0.s2 * b0;

629

c30 += (half8)a0.s3 * b0;

630

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

631

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

632

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

633

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

634

635

c00 += (half8)a0.s0 * b0;

636

c10 += (half8)a0.s1 * b0;

637

c20 += (half8)a0.s2 * b0;

638

c30 += (half8)a0.s3 * b0;

639

}

640

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

641

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

642

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

643

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

644

half4 a0 = vload4(0, src_addr_a);

645

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

646

647

c00 += (half8)a0.s0 * b0;

648

c10 += (half8)a0.s1 * b0;

649

c20 += (half8)a0.s2 * b0;

650

c30 += (half8)a0.s3 * b0;

651

}

652

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

653

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

654

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

655

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

656

#if defined(ALPHA)

657

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

658

c00 = c00 * (half8)ALPHA;

659

c10 = c10 * (half8)ALPHA;

660

c20 = c20 * (half8)ALPHA;

661

c30 = c30 * (half8)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

662

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

663

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

664

// Compute dst address

665

__global uchar *dst_addr = offset(&dst, 0, 0);

666

667

// Add offset for batched GEMM

668

dst_addr += z * dst_stride_z;

669

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

670

// Store 4x8 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

671

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

672

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

673

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

674

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

675

}

Gian Marco Iodice

bb36a8e

2018-04-19 12:05:08 +0100

[diff] [blame]

676

677

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)

678

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

679

*

680

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

681

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

682

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

683

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

684

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

685

*

686

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

687

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

688

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

689

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

690

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

691

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

692

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

693

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

694

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

695

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

696

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

697

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

698

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

699

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

700

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

701

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

702

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

703

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

704

*/

705

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

706

IMAGE_DECLARATION(src1),

707

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z)

{

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

713

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

714

int z = get_global_id(2);

715

716

// Offset

717

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

718

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

719

720

// src_addr_a = address of matrix A

721

// src_addr_b = address of matrix B

722

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

723

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

724

725

#if defined(MATRIX_B_DEPTH)

726

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

727

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

728

#else // defined(MATRIX_B_DEPTH)

729

src1_addr_in_bytes += z * src1_stride_z;

730

#endif // defined(MATRIX_B_DEPTH)

731

732

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

733

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

734

735

// Compute end row address for matrix B

736

__global half *src_end_addr_b = src_addr_b + COLS_B;

737

738

src_addr_a += offset_row_a;

739

src_addr_b += offset_row_b;

740

741

// Reset accumulators

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

748

749

int i = 0;

750

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

751

{

752

#if MULT_INTERLEAVE4X4_HEIGHT == 1

753

// Load values from matrix A (interleaved) and matrix B (transposed)

754

half8 a0 = vload8(0, src_addr_a);

755

half8 b0 = vload8(0, src_addr_b);

756

757

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

758

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

759

760

c00 = fma((half8)a0.s0, b0, c00);

761

c10 = fma((half8)a0.s1, b0, c10);

762

c20 = fma((half8)a0.s2, b0, c20);

763

c30 = fma((half8)a0.s3, b0, c30);

764

765

// Load values from matrix B (transposed)

766

b0 = vload8(0, src_addr_b);

767

768

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

769

770

c00 = fma((half8)a0.s4, b0, c00);

771

c10 = fma((half8)a0.s5, b0, c10);

772

c20 = fma((half8)a0.s6, b0, c20);

773

c30 = fma((half8)a0.s7, b0, c30);

774

775

// Load values from matrix A (interleaved) and matrix B (transposed)

776

a0 = vload8(0, src_addr_a);

777

b0 = vload8(0, src_addr_b);

778

779

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

780

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

781

782

c00 = fma((half8)a0.s0, b0, c00);

783

c10 = fma((half8)a0.s1, b0, c10);

784

c20 = fma((half8)a0.s2, b0, c20);

785

c30 = fma((half8)a0.s3, b0, c30);

786

787

// Load values from matrix B (transposed)

788

b0 = vload8(0, src_addr_b);

789

790

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

791

792

c00 = fma((half8)a0.s4, b0, c00);

793

c10 = fma((half8)a0.s5, b0, c10);

794

c20 = fma((half8)a0.s6, b0, c20);

795

c30 = fma((half8)a0.s7, b0, c30);

796

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

797

// Load values from matrix A (interleaved) and matrix B (transposed)

798

half4 a0 = vload4(0, src_addr_a);

799

half8 b0 = vload8(0, src_addr_b);

800

801

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

802

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

803

804

c00 = fma((half8)a0.s0, b0, c00);

805

c10 = fma((half8)a0.s1, b0, c10);

806

c20 = fma((half8)a0.s2, b0, c20);

807

c30 = fma((half8)a0.s3, b0, c30);

808

809

// Load values from matrix A (interleaved) and matrix B (transposed)

810

a0 = vload4(0, src_addr_a);

811

b0 = vload8(0, src_addr_b);

812

813

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

814

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

815

816

c00 = fma((half8)a0.s0, b0, c00);

817

c10 = fma((half8)a0.s1, b0, c10);

818

c20 = fma((half8)a0.s2, b0, c20);

819

c30 = fma((half8)a0.s3, b0, c30);

820

821

// Load values from matrix A (interleaved) and matrix B (transposed)

822

a0 = vload4(0, src_addr_a);

823

b0 = vload8(0, src_addr_b);

824

825

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

826

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

827

828

c00 = fma((half8)a0.s0, b0, c00);

829

c10 = fma((half8)a0.s1, b0, c10);

830

c20 = fma((half8)a0.s2, b0, c20);

831

c30 = fma((half8)a0.s3, b0, c30);

832

833

// Load values from matrix A (interleaved) and matrix B (transposed)

834

a0 = vload4(0, src_addr_a);

835

b0 = vload8(0, src_addr_b);

836

837

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

838

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

839

840

c00 = fma((half8)a0.s0, b0, c00);

841

c10 = fma((half8)a0.s1, b0, c10);

842

c20 = fma((half8)a0.s2, b0, c20);

843

c30 = fma((half8)a0.s3, b0, c30);

844

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

845

}

846

847

for(; i < (int)(COLS_MTX_B); ++i)

848

{

849

// Load values from matrix A (interleaved) and matrix B (transposed)

850

half4 a0 = vload4(0, src_addr_a);

851

half8 b0 = vload8(0, src_addr_b);

852

853

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

854

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

855

856

c00 = fma((half8)a0.s0, b0, c00);

857

c10 = fma((half8)a0.s1, b0, c10);

858

c20 = fma((half8)a0.s2, b0, c20);

859

c30 = fma((half8)a0.s3, b0, c30);

860

}

861

862

// Compute destination address

863

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

864

865

#if defined(ALPHA)

866

// Multiply by the weight of matrix product

867

c00 = c00 * (half8)ALPHA;

868

c10 = c10 * (half8)ALPHA;

869

c20 = c20 * (half8)ALPHA;

870

c30 = c30 * (half8)ALPHA;

871

#endif // defined(ALPHA)

872

873

// Compute dst address

874

__global uchar *dst_addr = offset(&dst, 0, 0);

875

876

// Add offset for batched GEMM

877

dst_addr += z * dst_stride_z;

878

879

// Store 4x8 block

880

vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

881

vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

882

vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

883

vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

884

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

885

886

// Undefine local defines

887

#undef COLS_MTX_B

888

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

889

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

890

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

891

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

892

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision

893

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication

894

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

895

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

896

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

897

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

898

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

899

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

900

* @note:ALPHA must be passed in 8 bit fixed point format

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

901

*

902

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8

903

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

904

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

905

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

906

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

907

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

908

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

909

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

910

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

911

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

912

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

913

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

914

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

915

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

916

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

917

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

918

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

919

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

920

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

921

__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),

922

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

923

IMAGE_DECLARATION(dst),

924

uint src0_stride_z,

925

uint src1_stride_z,

926

uint dst_stride_z)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

927

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

928

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

929

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

930

int z = get_global_id(2);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

931

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

932

// Offset

933

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

934

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

935

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

936

// src_addr_a = address of matrix A

937

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

938

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

939

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

940

941

#if defined(MATRIX_B_DEPTH)

942

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

943

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

944

#else // defined(MATRIX_B_DEPTH)

945

src1_addr_in_bytes += z * src1_stride_z;

946

#endif // defined(MATRIX_B_DEPTH)

947

948

__global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);

949

__global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

950

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

951

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

952

__global char *src_end_addr_b = src_addr_b + COLS_B;

953

954

src_addr_a += offset_row_a;

955

src_addr_b += offset_row_b;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

956

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

957

// Reset accumulators

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

short8 c00 = 0.0f;

short8 c10 = 0.0f;

short8 c20 = 0.0f;

short8 c30 = 0.0f;

short8 c01 = 0.0f;

short8 c11 = 0.0f;

short8 c21 = 0.0f;

short8 c31 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

967

// This for loop performs 1 accumulation for each iteration

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

968

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

969

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

970

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

971

char4 a0 = vload4(0, src_addr_a);

972

char16 b0 = vload16(0, src_addr_b);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

973

974

c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

975

c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);

976

c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);

977

c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);

978

979

c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

980

c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);

981

c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);

982

c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);

983

}

984

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

985

// Compute destination address

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

986

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

987

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

988

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

989

char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));

990

char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));

991

char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));

992

char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));

993

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

994

#if defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

995

c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

996

c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

997

c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

998

c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

999

#endif // defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1000

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1001

// Compute dst address

1002

__global uchar *dst_addr = offset(&dst, 0, 0);

1003

1004

// Add offset for batched GEMM

1005

dst_addr += z * dst_stride_z;

1006

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1007

// Store 16x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1008

vstore16(c00_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));

1009

vstore16(c10_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));

1010

vstore16(c20_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));

1011

vstore16(c30_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1012

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1013

1014

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision

1015

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

1016

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

1017

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

1018

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)

1019

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1020

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1021

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

1022

* @note ALPHA must be passed in 16 bit fixed point format

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1023

*

1024

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

1025

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1026

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

1027

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1028

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

1029

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1030

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1031

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1032

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

1033

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1034

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

1035

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1036

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1037

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1038

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1039

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1040

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1041

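* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes); multiplied by the Z work-item id to select the batch
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes); multiplied by the Z work-item id (modulo MATRIX_B_DEPTH when defined) to select the batch
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes); multiplied by the Z work-item id to select the batch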

1042

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1043

__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),

1044

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1045

IMAGE_DECLARATION(dst),

1046

uint src0_stride_z,

1047

uint src1_stride_z,

1048

uint dst_stride_z)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1049

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1050

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

1051

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1052

int z = get_global_id(2);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1053

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1054

// Offset

1055

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

1056

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1057

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1058

// src_addr_a = address of matrix A

1059

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1060

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

1061

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

1062

1063

#if defined(MATRIX_B_DEPTH)

1064

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1065

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

1066

#else // defined(MATRIX_B_DEPTH)

1067

src1_addr_in_bytes += z * src1_stride_z;

1068

#endif // defined(MATRIX_B_DEPTH)

1069

1070

__global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);

1071

__global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1072

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1073

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1074

__global short *src_end_addr_b = src_addr_b + COLS_B;

1075

1076

src_addr_a += offset_row_a;

1077

src_addr_b += offset_row_b;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1078

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1079

// Reset accumulators

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

int8 c00 = 0.0f;

int8 c10 = 0.0f;

int8 c20 = 0.0f;

int8 c30 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1085

// This for loop performs 1 accumulation for each iteration

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1086

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1087

{

1088

/* Load values from matrix A (interleaved) and matrix B (transposed) */

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1089

short4 a0 = vload4(0, src_addr_a);

1090

short8 b0 = vload8(0, src_addr_b);

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1091

1092

c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);

1093

c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);

1094

c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);

1095

c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);

1096

}

1097

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1098

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1099

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1100

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1101

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1102

short8 c00_qs16 = convert_short8_sat(c00);

1103

short8 c10_qs16 = convert_short8_sat(c10);

1104

short8 c20_qs16 = convert_short8_sat(c20);

1105

short8 c30_qs16 = convert_short8_sat(c30);

1106

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1107

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1108

c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1109

c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1110

c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1111

c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1112

#endif // defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1113

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1114

// Compute dst address

1115

__global uchar *dst_addr = offset(&dst, 0, 0);

1116

1117

// Add offset for batched GEMM

1118

dst_addr += z * dst_stride_z;

1119

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1120

// Store 8x4 block

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1121

vstore8(c00_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));

1122

vstore8(c10_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));

1123

vstore8(c20_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));

1124

vstore8(c30_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1125

}

1126

#endif // defined(FIXED_POINT_POSITION)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

1127

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1128

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1129

// The non-reshaped GEMM kernels below are only compiled when all three build options are provided
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)

1130

#if defined(DATA_TYPE)

1131

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1132

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1133

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1134

* @note This OpenCL kernel works with floating point data types (F16/F32)

1135

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1136

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1137

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1138

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1139

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1140

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1141

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1142

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1143

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

1144

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1145

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

1146

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1147

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1148

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1149

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

1150

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1151

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

1152

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1153

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1154

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1155

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

1156

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1157

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

1158

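* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes); multiplied by the Z work-item id to select the batch
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes); multiplied by the Z work-item id (modulo MATRIX_B_DEPTH when defined) to select the batch
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes); multiplied by the Z work-item id to select the batch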

1159

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1160

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

1161

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1162

IMAGE_DECLARATION(dst),

1163

uint src0_stride_z,

1164

uint src1_stride_z,

1165

uint dst_stride_z)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1166

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1167

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1168

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1169

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1170

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1171

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1172

// Update address for the matrix A

1173

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1174

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1175

// Update address for the matrix B

1176

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1177

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1178

// Add offset for batched GEMM

1179

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1180

1181

#if defined(MATRIX_B_DEPTH)

1182

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1183

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1184

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1185

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1186

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1187

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1188

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

1189

1190

VECTOR_TYPE acc0 = 0.0f;

1191

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1192

VECTOR_TYPE acc1 = 0.0f;

1193

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1194

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1195

VECTOR_TYPE acc2 = 0.0f;

1196

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1197

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1198

VECTOR_TYPE acc3 = 0.0f;

1199

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1200

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1201

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1202

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1203

// Load values from matrix A

1204

VEC_DATA_TYPE(DATA_TYPE, 2)

1205

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1206

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1207

VEC_DATA_TYPE(DATA_TYPE, 2)

1208

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1209

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1210

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1211

VEC_DATA_TYPE(DATA_TYPE, 2)

1212

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1213

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1214

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1215

VEC_DATA_TYPE(DATA_TYPE, 2)

1216

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1217

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1218

// Load values from matrix B

1219

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

1220

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1221

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1222

// Accumulate

1223

acc0 += b0 * (VECTOR_TYPE)a0.s0;

1224

acc0 += b1 * (VECTOR_TYPE)a0.s1;

1225

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1226

acc1 += b0 * (VECTOR_TYPE)a1.s0;

1227

acc1 += b1 * (VECTOR_TYPE)a1.s1;

1228

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1229

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1230

acc2 += b0 * (VECTOR_TYPE)a2.s0;

1231

acc2 += b1 * (VECTOR_TYPE)a2.s1;

1232

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1233

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1234

acc3 += b0 * (VECTOR_TYPE)a3.s0;

1235

acc3 += b1 * (VECTOR_TYPE)a3.s1;

1236

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1237

}

1238

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1239

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1240

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1241

// Load values from matrix A

1242

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1243

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1244

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1245

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1246

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1247

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1248

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1249

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1250

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1251

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1252

// Load values from matrix B

1253

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1254

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1255

// Accumulate

1256

acc0 += b0 * (VECTOR_TYPE)a0;

1257

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1258

acc1 += b0 * (VECTOR_TYPE)a1;

1259

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1260

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1261

acc2 += b0 * (VECTOR_TYPE)a2;

1262

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1263

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1264

acc3 += b0 * (VECTOR_TYPE)a3;

1265

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1266

}

1267

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1268

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1269

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1270

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1271

// Compute dst address

1272

__global uchar *dst_addr = offset(&dst, 0, 0);

1273

1274

// Add offset for batched GEMM

1275

dst_addr += get_global_id(2) * dst_stride_z;

1276

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1277

// Multiply by the weight of matrix-matrix product and store the result

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1278

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1279

acc0 = acc0 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1280

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1281

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1282

(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1283

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1284

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1285

acc1 = acc1 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1286

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1287

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1288

(acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1289

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1290

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1291

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1292

acc2 = acc2 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1293

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1294

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1295

(acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1296

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1297

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1298

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1299

acc3 = acc3 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1300

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1301

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1302

(acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1303

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1304

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1305

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1306

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

1307

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1308

*

1309

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1310

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1311

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1312

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1313

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1314

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1315

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1316

*

1317

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1318

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1319

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

1320

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1321

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

1322

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1323

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1324

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1325

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

1326

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1327

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

1328

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1329

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1330

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1331

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

1332

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1333

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

1334

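* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes); multiplied by the Z work-item id to select the batch
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes); multiplied by the Z work-item id (modulo MATRIX_B_DEPTH when defined) to select the batch
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)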

1335

*/

1336

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

1337

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1338

IMAGE_DECLARATION(dst),

1339

uint src0_stride_z,

1340

uint src1_stride_z,

1341

uint dst_stride_z)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1342

{

1343

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1344

1345

// Compute starting address for matrix A and matrix B

1346

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1347

1348

// Update address for matrix A

1349

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1350

1351

// Update address for matrix B

1352

src_addr.s1 += idx * sizeof(float);

1353

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1354

// Add offset for batched GEMM

1355

src_addr.s0 += get_global_id(2) * src0_stride_z;

1356

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1357

#if defined(MATRIX_B_DEPTH)

1358

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1359

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1360

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1361

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1362

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1363

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1364

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

float acc02 = 0.0f;

float acc03 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

float acc10 = 0.0f;

float acc11 = 0.0f;

float acc12 = 0.0f;

float acc13 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1376

1377

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

float acc20 = 0.0f;

float acc21 = 0.0f;

float acc22 = 0.0f;

float acc23 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1383

1384

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

float acc30 = 0.0f;

float acc31 = 0.0f;

float acc32 = 0.0f;

float acc33 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1390

1391

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1392

int i = 0;

1393

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1394

{

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1395

// Load values from matrix A and matrix B

1396

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1397

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1398

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1399

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1400

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1401

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1402

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1403

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1404

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1405

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1406

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1407

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1408

1409

// Multiply and accumulate

1410

acc00 = fma(a0.s0, b0.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1411

acc01 = fma(a0.s0, b0.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1412

acc02 = fma(a0.s0, b0.s2, acc02);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1413

acc03 = fma(a0.s0, b0.s3, acc03);

1414

1415

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1416

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1417

acc10 = fma(a1.s0, b0.s0, acc10);

1418

acc11 = fma(a1.s0, b0.s1, acc11);

1419

acc12 = fma(a1.s0, b0.s2, acc12);

1420

acc13 = fma(a1.s0, b0.s3, acc13);

1421

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1422

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1423

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1424

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1425

acc20 = fma(a2.s0, b0.s0, acc20);

1426

acc21 = fma(a2.s0, b0.s1, acc21);

1427

acc22 = fma(a2.s0, b0.s2, acc22);

1428

acc23 = fma(a2.s0, b0.s3, acc23);

1429

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1430

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1431

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1432

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1433

acc30 = fma(a3.s0, b0.s0, acc30);

1434

acc31 = fma(a3.s0, b0.s1, acc31);

1435

acc32 = fma(a3.s0, b0.s2, acc32);

1436

acc33 = fma(a3.s0, b0.s3, acc33);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1437

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1438

1439

// Load values from matrix A and matrix B

1440

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1441

src_addr.s1 += src1_stride_y;

1442

1443

// Multiply and accumulate

1444

acc00 = fma(a0.s1, b0.s0, acc00);

1445

acc01 = fma(a0.s1, b0.s1, acc01);

1446

acc02 = fma(a0.s1, b0.s2, acc02);

1447

acc03 = fma(a0.s1, b0.s3, acc03);

1448

1449

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1450

1451

acc10 = fma(a1.s1, b0.s0, acc10);

1452

acc11 = fma(a1.s1, b0.s1, acc11);

1453

acc12 = fma(a1.s1, b0.s2, acc12);

1454

acc13 = fma(a1.s1, b0.s3, acc13);

1455

1456

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1457

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1458

1459

acc20 = fma(a2.s1, b0.s0, acc20);

1460

acc21 = fma(a2.s1, b0.s1, acc21);

1461

acc22 = fma(a2.s1, b0.s2, acc22);

1462

acc23 = fma(a2.s1, b0.s3, acc23);

1463

1464

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1465

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1466

1467

acc30 = fma(a3.s1, b0.s0, acc30);

1468

acc31 = fma(a3.s1, b0.s1, acc31);

1469

acc32 = fma(a3.s1, b0.s2, acc32);

1470

acc33 = fma(a3.s1, b0.s3, acc33);

1471

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1472

1473

// Load values from matrix A and matrix B

1474

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1475

src_addr.s1 += src1_stride_y;

1476

1477

// Multiply and accumulate

1478

acc00 = fma(a0.s2, b0.s0, acc00);

1479

acc01 = fma(a0.s2, b0.s1, acc01);

1480

acc02 = fma(a0.s2, b0.s2, acc02);

1481

acc03 = fma(a0.s2, b0.s3, acc03);

1482

1483

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1484

1485

acc10 = fma(a1.s2, b0.s0, acc10);

1486

acc11 = fma(a1.s2, b0.s1, acc11);

1487

acc12 = fma(a1.s2, b0.s2, acc12);

1488

acc13 = fma(a1.s2, b0.s3, acc13);

1489

1490

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1491

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1492

1493

acc20 = fma(a2.s2, b0.s0, acc20);

1494

acc21 = fma(a2.s2, b0.s1, acc21);

1495

acc22 = fma(a2.s2, b0.s2, acc22);

1496

acc23 = fma(a2.s2, b0.s3, acc23);

1497

1498

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1499

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1500

1501

acc30 = fma(a3.s2, b0.s0, acc30);

1502

acc31 = fma(a3.s2, b0.s1, acc31);

1503

acc32 = fma(a3.s2, b0.s2, acc32);

1504

acc33 = fma(a3.s2, b0.s3, acc33);

1505

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1506

1507

// Load values from matrix A and matrix B

1508

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1509

src_addr.s1 += src1_stride_y;

1510

1511

// Multiply and accumulate

1512

acc00 = fma(a0.s3, b0.s0, acc00);

1513

acc01 = fma(a0.s3, b0.s1, acc01);

1514

acc02 = fma(a0.s3, b0.s2, acc02);

1515

acc03 = fma(a0.s3, b0.s3, acc03);

1516

1517

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1518

1519

acc10 = fma(a1.s3, b0.s0, acc10);

1520

acc11 = fma(a1.s3, b0.s1, acc11);

1521

acc12 = fma(a1.s3, b0.s2, acc12);

1522

acc13 = fma(a1.s3, b0.s3, acc13);

1523

1524

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1525

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1526

1527

acc20 = fma(a2.s3, b0.s0, acc20);

1528

acc21 = fma(a2.s3, b0.s1, acc21);

1529

acc22 = fma(a2.s3, b0.s2, acc22);

1530

acc23 = fma(a2.s3, b0.s3, acc23);

1531

1532

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1533

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1534

1535

acc30 = fma(a3.s3, b0.s0, acc30);

1536

acc31 = fma(a3.s3, b0.s1, acc31);

1537

acc32 = fma(a3.s3, b0.s2, acc32);

1538

acc33 = fma(a3.s3, b0.s3, acc33);

1539

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1540

1541

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1542

}

1543

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1544

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1545

{

1546

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1547

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1548

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1549

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1550

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1551

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1552

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1553

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1554

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1555

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1556

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1557

// Load values from matrix B

1558

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1559

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1560

1561

// Multiply and accumulate

1562

acc00 = fma(a0, b0.s0, acc00);

1563

acc01 = fma(a0, b0.s1, acc01);

1564

acc02 = fma(a0, b0.s2, acc02);

1565

acc03 = fma(a0, b0.s3, acc03);

1566

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1567

acc10 = fma(a1, b0.s0, acc10);

1568

acc11 = fma(a1, b0.s1, acc11);

1569

acc12 = fma(a1, b0.s2, acc12);

1570

acc13 = fma(a1, b0.s3, acc13);

1571

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1572

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1573

acc20 = fma(a2, b0.s0, acc20);

1574

acc21 = fma(a2, b0.s1, acc21);

1575

acc22 = fma(a2, b0.s2, acc22);

1576

acc23 = fma(a2, b0.s3, acc23);

1577

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1578

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1579

acc30 = fma(a3, b0.s0, acc30);

1580

acc31 = fma(a3, b0.s1, acc31);

1581

acc32 = fma(a3, b0.s2, acc32);

1582

acc33 = fma(a3, b0.s3, acc33);

1583

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1584

1585

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1586

}

1587

1588

// Compute destination address

1589

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1590

1591

// Multiply by the weight of matrix-matrix product and store the result

1592

#if defined(ALPHA)

1593

acc00 = acc00 * ALPHA;

1594

acc01 = acc01 * ALPHA;

1595

acc02 = acc02 * ALPHA;

1596

acc03 = acc03 * ALPHA;

1597

#endif // defined(ALPHA)

1598

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1599

// Compute dst address

1600

__global uchar *dst_addr = offset(&dst, 0, 0);

1601

1602

// Add offset for batched GEMM

1603

dst_addr += get_global_id(2) * dst_stride_z;

1604

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1605

float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1606

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1607

1608

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1609

#if defined(ALPHA)

1610

acc10 = acc10 * ALPHA;

1611

acc11 = acc11 * ALPHA;

1612

acc12 = acc12 * ALPHA;

1613

acc13 = acc13 * ALPHA;

1614

#endif // defined(ALPHA)

1615

float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1616

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1617

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1618

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1619

#if defined(ALPHA)

1620

acc20 = acc20 * ALPHA;

1621

acc21 = acc21 * ALPHA;

1622

acc22 = acc22 * ALPHA;

1623

acc23 = acc23 * ALPHA;

1624

#endif // defined(ALPHA)

1625

float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1626

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1627

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1628

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1629

#if defined(ALPHA)

1630

acc30 = acc30 * ALPHA;

1631

acc31 = acc31 * ALPHA;

1632

acc32 = acc32 * ALPHA;

1633

acc33 = acc33 * ALPHA;

1634

#endif // defined(ALPHA)

1635

float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1636

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1637

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1638

}

1639

1640

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1641

*

1642

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1643

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.

1644

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1645

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

1646

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1647

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1648

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1649

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1650

*

1651

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1652

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1653

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1654

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1655

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1656

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1657

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1658

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1659

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1660

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1661

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1662

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1663

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1664

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1665

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1666

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1667

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1668

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1669

*/

1670

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

1671

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1672

IMAGE_DECLARATION(dst),

1673

uint src0_stride_z,

1674

uint src1_stride_z,

1675

uint dst_stride_z)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1676

{

1677

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1678

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1679

1680

// Compute starting address for matrix A and Matrix B

1681

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1682

1683

// Update address for the matrix A

1684

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1685

1686

// Update address for the matrix B

1687

src_addr.s1 += idx * sizeof(float);

1688

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1689

// Add offset for batched GEMM

1690

src_addr.s0 += get_global_id(2) * src0_stride_z;

1691

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1692

#if defined(MATRIX_B_DEPTH)

1693

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1694

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1695

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1696

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

1697

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1698

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1699

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1704

float acc10 = 0.0f;

1705

float acc11 = 0.0f;

1706

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1707

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1708

float acc20 = 0.0f;

1709

float acc21 = 0.0f;

1710

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1711

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1712

float acc30 = 0.0f;

1713

float acc31 = 0.0f;

1714

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1715

1716

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1717

int i = 0;

1718

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1719

{

1720

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1721

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1722

1723

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1724

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1725

src_addr.s1 += src1_stride_y;

1726

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1727

src_addr.s1 += src1_stride_y;

1728

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1729

src_addr.s1 += src1_stride_y;

1730

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1731

src_addr.s1 += src1_stride_y;

1732

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1733

src_addr.s1 += src1_stride_y;

1734

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1735

src_addr.s1 += src1_stride_y;

1736

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1737

src_addr.s1 += src1_stride_y;

1738

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1739

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1740

1741

// Multiply and accumulate

1742

acc00 = fma(a0.s0, b0.s0, acc00);

1743

acc00 = fma(a0.s1, b1.s0, acc00);

1744

acc00 = fma(a0.s2, b2.s0, acc00);

1745

acc00 = fma(a0.s3, b3.s0, acc00);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1746

acc00 = fma(a0.s4, b4.s0, acc00);

1747

acc00 = fma(a0.s5, b5.s0, acc00);

1748

acc00 = fma(a0.s6, b6.s0, acc00);

1749

acc00 = fma(a0.s7, b7.s0, acc00);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1750

1751

acc01 = fma(a0.s0, b0.s1, acc01);

1752

acc01 = fma(a0.s1, b1.s1, acc01);

1753

acc01 = fma(a0.s2, b2.s1, acc01);

1754

acc01 = fma(a0.s3, b3.s1, acc01);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1755

acc01 = fma(a0.s4, b4.s1, acc01);

1756

acc01 = fma(a0.s5, b5.s1, acc01);

1757

acc01 = fma(a0.s6, b6.s1, acc01);

1758

acc01 = fma(a0.s7, b7.s1, acc01);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1759

1760

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1761

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1762

acc10 = fma(a0.s0, b0.s0, acc10);

1763

acc10 = fma(a0.s1, b1.s0, acc10);

1764

acc10 = fma(a0.s2, b2.s0, acc10);

1765

acc10 = fma(a0.s3, b3.s0, acc10);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1766

acc10 = fma(a0.s4, b4.s0, acc10);

1767

acc10 = fma(a0.s5, b5.s0, acc10);

1768

acc10 = fma(a0.s6, b6.s0, acc10);

1769

acc10 = fma(a0.s7, b7.s0, acc10);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1770

1771

acc11 = fma(a0.s0, b0.s1, acc11);

1772

acc11 = fma(a0.s1, b1.s1, acc11);

1773

acc11 = fma(a0.s2, b2.s1, acc11);

1774

acc11 = fma(a0.s3, b3.s1, acc11);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1775

acc11 = fma(a0.s4, b4.s1, acc11);

1776

acc11 = fma(a0.s5, b5.s1, acc11);

1777

acc11 = fma(a0.s6, b6.s1, acc11);

1778

acc11 = fma(a0.s7, b7.s1, acc11);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1779

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1780

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1781

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1782

acc20 = fma(a0.s0, b0.s0, acc20);

1783

acc20 = fma(a0.s1, b1.s0, acc20);

1784

acc20 = fma(a0.s2, b2.s0, acc20);

1785

acc20 = fma(a0.s3, b3.s0, acc20);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1786

acc20 = fma(a0.s4, b4.s0, acc20);

1787

acc20 = fma(a0.s5, b5.s0, acc20);

1788

acc20 = fma(a0.s6, b6.s0, acc20);

1789

acc20 = fma(a0.s7, b7.s0, acc20);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1790

1791

acc21 = fma(a0.s0, b0.s1, acc21);

1792

acc21 = fma(a0.s1, b1.s1, acc21);

1793

acc21 = fma(a0.s2, b2.s1, acc21);

1794

acc21 = fma(a0.s3, b3.s1, acc21);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1795

acc21 = fma(a0.s4, b4.s1, acc21);

1796

acc21 = fma(a0.s5, b5.s1, acc21);

1797

acc21 = fma(a0.s6, b6.s1, acc21);

1798

acc21 = fma(a0.s7, b7.s1, acc21);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1799

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1800

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1801

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1802

acc30 = fma(a0.s0, b0.s0, acc30);

1803

acc30 = fma(a0.s1, b1.s0, acc30);

1804

acc30 = fma(a0.s2, b2.s0, acc30);

1805

acc30 = fma(a0.s3, b3.s0, acc30);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1806

acc30 = fma(a0.s4, b4.s0, acc30);

1807

acc30 = fma(a0.s5, b5.s0, acc30);

1808

acc30 = fma(a0.s6, b6.s0, acc30);

1809

acc30 = fma(a0.s7, b7.s0, acc30);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1810

1811

acc31 = fma(a0.s0, b0.s1, acc31);

1812

acc31 = fma(a0.s1, b1.s1, acc31);

1813

acc31 = fma(a0.s2, b2.s1, acc31);

1814

acc31 = fma(a0.s3, b3.s1, acc31);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1815

acc31 = fma(a0.s4, b4.s1, acc31);

1816

acc31 = fma(a0.s5, b5.s1, acc31);

1817

acc31 = fma(a0.s6, b6.s1, acc31);

1818

acc31 = fma(a0.s7, b7.s1, acc31);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1819

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1820

1821

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1822

}

1823

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1824

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1825

{

1826

// Load values from matrix A

1827

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1828

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1829

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1830

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1831

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1832

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1833

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1834

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1835

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1836

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1837

// Load values from matrix B

1838

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1839

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1840

1841

// Multiply and accumulate

1842

acc00 = fma(a0, b0.s0, acc00);

1843

acc01 = fma(a0, b0.s1, acc01);

1844

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1845

acc10 = fma(a1, b0.s0, acc10);

1846

acc11 = fma(a1, b0.s1, acc11);

1847

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1848

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1849

acc20 = fma(a2, b0.s0, acc20);

1850

acc21 = fma(a2, b0.s1, acc21);

1851

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1852

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1853

acc30 = fma(a3, b0.s0, acc30);

1854

acc31 = fma(a3, b0.s1, acc31);

1855

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

1856

1857

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1858

}

1859

1860

// Compute destination address

1861

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1862

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1863

// Compute dst address

1864

__global uchar *dst_addr = offset(&dst, 0, 0);

1865

1866

// Add offset for batched GEMM

1867

dst_addr += get_global_id(2) * dst_stride_z;

1868

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1869

// Multiply by the weight of matrix-matrix product and store the result

1870

#if defined(ALPHA)

1871

acc00 = acc00 * ALPHA;

1872

acc01 = acc01 * ALPHA;

1873

#endif // defined(ALPHA)

1874

float2 acc0 = ((float2)(acc00, acc01));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1875

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1876

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1877

#if defined(ALPHA)

1878

acc10 = acc10 * ALPHA;

1879

acc11 = acc11 * ALPHA;

1880

#endif // defined(ALPHA)

1881

float2 acc1 = ((float2)(acc10, acc11));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1882

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1883

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1884

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1885

#if defined(ALPHA)

1886

acc20 = acc20 * ALPHA;

1887

acc21 = acc21 * ALPHA;

1888

#endif // defined(ALPHA)

1889

float2 acc2 = ((float2)(acc20, acc21));

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1890

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1891

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1892

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1893

#if defined(ALPHA)

1894

acc30 = acc30 * ALPHA;

1895

acc31 = acc31 * ALPHA;

1896

#endif // defined(ALPHA)

1897

float2 acc3 = (float2)(acc30, acc31);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

1898

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1899

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1900

}

1901

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

1902

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

1903

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1904

*

1905

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

1906

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1907

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1908

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1909

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

1910

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

1911

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

1912

*

1913

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

1914

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1915

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1916

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1917

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1918

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1919

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1920

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1921

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1922

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1923

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1924

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1925

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1926

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1927

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1928

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1929

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1930

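* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
* @param[in] src0_stride_z Stride of the source matrix A in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix B in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)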

1931

*/

1932

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

1933

IMAGE_DECLARATION(src1),

1934

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z)

{

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1940

1941

// Compute starting address for matrix A and Matrix B

1942

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1943

1944

// Update address for the matrix A

1945

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1946

1947

// Update address for the matrix B

1948

src_addr.s1 += idx * sizeof(half);

1949

1950

// Add offset for batched GEMM

1951

src_addr.s0 += get_global_id(2) * src0_stride_z;

1952

1953

#if defined(MATRIX_B_DEPTH)

1954

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1955

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1956

#else // defined(MATRIX_B_DEPTH)

1957

src_addr.s1 += get_global_id(2) * src1_stride_z;

1958

#endif // defined(MATRIX_B_DEPTH)

1959

1960

half8 acc0 = 0.0h;

1961

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1962

half8 acc1 = 0.0h;

1963

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1964

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1965

half8 acc2 = 0.0h;

1966

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1967

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1968

half8 acc3 = 0.0h;

1969

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1970

1971

int i = 0;

1972

for(; i <= ((int)COLS_A - 4); i += 4)

1973

{

1974

// Load values from matrix A

1975

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1976

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1977

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1978

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1979

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1980

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1981

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1982

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1983

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1984

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1985

// Load values from matrix B

1986

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

1987

src_addr.s1 += src1_stride_y;

1988

1989

// Accumulate

1990

acc0 = fma(b0, (half8)a0.s0, acc0);

1991

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1992

acc1 = fma(b0, (half8)a1.s0, acc1);

1993

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1994

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1995

acc2 = fma(b0, (half8)a2.s0, acc2);

1996

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1997

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1998

acc3 = fma(b0, (half8)a3.s0, acc3);

1999

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2000

2001

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2002

src_addr.s1 += src1_stride_y;

2003

acc0 = fma(b0, (half8)a0.s1, acc0);

2004

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2005

acc1 = fma(b0, (half8)a1.s1, acc1);

2006

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2007

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2008

acc2 = fma(b0, (half8)a2.s1, acc2);

2009

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2010

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2011

acc3 = fma(b0, (half8)a3.s1, acc3);

2012

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2013

2014

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2015

src_addr.s1 += src1_stride_y;

2016

acc0 = fma(b0, (half8)a0.s2, acc0);

2017

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2018

acc1 = fma(b0, (half8)a1.s2, acc1);

2019

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2020

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2021

acc2 = fma(b0, (half8)a2.s2, acc2);

2022

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2023

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2024

acc3 = fma(b0, (half8)a3.s2, acc3);

2025

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2026

2027

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2028

src_addr.s1 += src1_stride_y;

2029

acc0 = fma(b0, (half8)a0.s3, acc0);

2030

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2031

acc1 = fma(b0, (half8)a1.s3, acc1);

2032

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2033

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2034

acc2 = fma(b0, (half8)a2.s3, acc2);

2035

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2036

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2037

acc3 = fma(b0, (half8)a3.s3, acc3);

2038

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2039

2040

src_addr.s0 += 4 * sizeof(half);

2041

}

2042

2043

for(; i < (int)COLS_A; ++i)

2044

{

2045

// Load values from matrix A

2046

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2047

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2048

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2049

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2050

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2051

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2052

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2053

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2054

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2055

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2056

// Load values from matrix B

2057

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

2058

2059

src_addr += (int2)(sizeof(half), src1_stride_y);

2060

2061

// Accumulate

2062

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

2063

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2064

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

2065

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2066

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2067

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

2068

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2069

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2070

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

2071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2072

}

2073

2074

// Compute destination address

2075

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2076

2077

// Compute dst address

2078

__global uchar *dst_addr = offset(&dst, 0, 0);

2079

2080

// Add offset for batched GEMM

2081

dst_addr += get_global_id(2) * dst_stride_z;

2082

2083

// Multiply by the weight of matrix-matrix product and store the result

2084

#if defined(ALPHA)

2085

acc0 = acc0 * (half8)ALPHA;

2086

#endif // defined(ALPHA)

2087

vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));

2088

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2089

#if defined(ALPHA)

2090

acc1 = acc1 * (half8)ALPHA;

2091

#endif // defined(ALPHA)

2092

vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));

2093

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2094

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2095

#if defined(ALPHA)

2096

acc2 = acc2 * (half8)ALPHA;

2097

#endif // defined(ALPHA)

2098

vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));

2099

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2100

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2101

#if defined(ALPHA)

2102

acc3 = acc3 * (half8)ALPHA;

2103

#endif // defined(ALPHA)

2104

vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));

2105

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2106

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

2107

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

2108

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2109

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2110

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @note This OpenCL kernel works with the fixed point data type QS8
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A
 * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
 * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
 * @note Every work-item reads 16 consecutive values per row of matrix B and writes 16 consecutive values per destination row, independently of NUM_ELEMS_PROCESSED_PER_THREAD_X
 *       (presumably the host always sets -DNUM_ELEMS_PROCESSED_PER_THREAD_X=16 for this kernel - TODO confirm against the caller)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the matrix A tensor in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the matrix B tensor in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 */
__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst),
                          uint src0_stride_z,
                          uint src1_stride_z,
                          uint dst_stride_z)
{
    // First column of matrix B handled by this work-item
    const int first_col = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Index along the third dimension (batched GEMM)
    const size_t batch = get_global_id(2);

    // Byte offset inside src0: first element + first row of this work-item + batch
    int a_off = src0_offset_first_element_in_bytes;
    a_off += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
    a_off += batch * src0_stride_z;

    // Byte offset inside src1: first element + first column of this work-item + batch
    int b_off = src1_offset_first_element_in_bytes;
    b_off += first_col * sizeof(char);
#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    b_off += (batch % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    b_off += batch * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the matrix A row being read
    const int a_row_end = a_off + (COLS_A * sizeof(char));

    // One pair of 8-wide 16-bit accumulators (low / high half of the 16 output columns) per output row
    short8 acc00 = 0;
    short8 acc01 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    short8 acc10 = 0;
    short8 acc11 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    short8 acc20 = 0;
    short8 acc21 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    short8 acc30 = 0;
    short8 acc31 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: each iteration consumes two elements of every matrix A row and two rows of matrix B.
    // The accumulation is saturating, so the k-order within each accumulator is kept (k first, then k + 1).
    while(a_off <= (a_row_end - 2))
    {
        const char16 b_k0 = vload16(0, (__global char *)(src1_ptr + b_off + 0 * src1_stride_y));
        const char16 b_k1 = vload16(0, (__global char *)(src1_ptr + b_off + 1 * src1_stride_y));

        const char2 a0 = vload2(0, (__global char *)(src0_ptr + a_off + 0 * src0_stride_y));
        acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b_k0.s01234567, FIXED_POINT_POSITION);
        acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b_k1.s01234567, FIXED_POINT_POSITION);
        acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b_k0.s89ABCDEF, FIXED_POINT_POSITION);
        acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b_k1.s89ABCDEF, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        const char2 a1 = vload2(0, (__global char *)(src0_ptr + a_off + 1 * src0_stride_y));
        acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b_k0.s01234567, FIXED_POINT_POSITION);
        acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b_k1.s01234567, FIXED_POINT_POSITION);
        acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b_k0.s89ABCDEF, FIXED_POINT_POSITION);
        acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b_k1.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        const char2 a2 = vload2(0, (__global char *)(src0_ptr + a_off + 2 * src0_stride_y));
        acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b_k0.s01234567, FIXED_POINT_POSITION);
        acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b_k1.s01234567, FIXED_POINT_POSITION);
        acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b_k0.s89ABCDEF, FIXED_POINT_POSITION);
        acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b_k1.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        const char2 a3 = vload2(0, (__global char *)(src0_ptr + a_off + 3 * src0_stride_y));
        acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b_k0.s01234567, FIXED_POINT_POSITION);
        acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b_k1.s01234567, FIXED_POINT_POSITION);
        acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b_k0.s89ABCDEF, FIXED_POINT_POSITION);
        acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b_k1.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance by two elements along the matrix A row and two rows down matrix B
        a_off += 2;
        b_off += (int)(2 * src1_stride_y);
    }

    // Left-over accumulations: one element of matrix A / one row of matrix B at a time
    while(a_off < a_row_end)
    {
        const char16 b_k = vload16(0, (__global char *)(src1_ptr + b_off));

        const char a0 = *((__global char *)(src0_ptr + a_off + 0 * src0_stride_y));
        acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b_k.s01234567, FIXED_POINT_POSITION);
        acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b_k.s89ABCDEF, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        const char a1 = *((__global char *)(src0_ptr + a_off + 1 * src0_stride_y));
        acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b_k.s01234567, FIXED_POINT_POSITION);
        acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b_k.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        const char a2 = *((__global char *)(src0_ptr + a_off + 2 * src0_stride_y));
        acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b_k.s01234567, FIXED_POINT_POSITION);
        acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b_k.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        const char a3 = *((__global char *)(src0_ptr + a_off + 3 * src0_stride_y));
        acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b_k.s01234567, FIXED_POINT_POSITION);
        acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b_k.s89ABCDEF, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        a_off += 1;
        b_off += (int)src1_stride_y;
    }

    // Compute destination address, including the offset for batched GEMM
    Image           dst      = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst, 0, 0) + batch * dst_stride_z;

    // For every output row: narrow the accumulators to QS8 with saturation,
    // apply the optional alpha weight and store the 16 results
    char16 out_row0 = convert_char16_sat((short16)(acc00, acc01));
#if defined(ALPHA)
    out_row0 = mul_sat_qs8x16(out_row0, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_row0, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    char16 out_row1 = convert_char16_sat((short16)(acc10, acc11));
#if defined(ALPHA)
    out_row1 = mul_sat_qs8x16(out_row1, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_row1, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    char16 out_row2 = convert_char16_sat((short16)(acc20, acc21));
#if defined(ALPHA)
    out_row2 = mul_sat_qs8x16(out_row2, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_row2, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    char16 out_row3 = convert_char16_sat((short16)(acc30, acc31));
#if defined(ALPHA)
    out_row3 = mul_sat_qs8x16(out_row3, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore16(out_row3, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2293

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2294

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2295

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2296

* @note This OpenCL kernel works with fixed point data types QS16

2297

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2298

* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2299

* @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2300

* @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2301

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)

2302

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2303

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2304

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2305

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2306

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

2307

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2308

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

2309

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

2310

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

2311

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2312

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

2313

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2314

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

2315

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

2316

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

2317

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2318

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2319

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2320

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2321

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2322

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2323

__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2324

IMAGE_DECLARATION(src1),

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2325

IMAGE_DECLARATION(dst),

2326

uint src0_stride_z,

2327

uint src1_stride_z,

2328

uint dst_stride_z)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2329

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2330

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2331

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2332

// Compute starting address for matrix A and Matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2333

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2334

2335

// Update address for the matrix A

2336

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

2337

2338

// Update address for the matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2339

src_addr.s1 += idx * sizeof(short);

2340

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2341

// Add offset for batched GEMM

2342

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2343

2344

#if defined(MATRIX_B_DEPTH)

2345

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2346

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

2347

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2348

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2349

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2350

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2351

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2352

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2353

int8 acc0 = 0;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2354

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2355

int8 acc1 = 0;

2356

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2357

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2358

int8 acc2 = 0;

2359

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2360

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2361

int8 acc3 = 0;

2362

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2363

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2364

// This for loop performs 4 accumulations per iteration

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

2365

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2366

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2367

short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2368

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2369

short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2370

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2371

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2372

short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2373

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2374

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2375

short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2376

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2377

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

2378

short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2379

2380

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);

2381

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2382

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2383

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);

2384

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);

2385

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2386

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2387

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);

2388

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);

2389

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2390

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2391

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);

2392

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);

2393

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2394

}

2395

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2396

// Left-over accumulations

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2397

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))

2398

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2399

short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

2400

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2401

short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

2402

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2403

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2404

short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

2405

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2406

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2407

short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

2408

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2409

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));

2410

2411

acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2412

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2413

acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);

2414

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2415

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2416

acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);

2417

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2418

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2419

acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);

2420

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2421

}

2422

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2423

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2424

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2425

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2426

// Compute dst address

2427

__global uchar *dst_addr = offset(&dst, 0, 0);

2428

Gian Marco Iodice

81b28c4

2018-03-29 10:29:36 +0100

[diff] [blame]

2429

// Add offset for batched GEMM

2430

dst_addr += get_global_id(2) * dst_stride_z;

2431

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2432

// Multiply by the weight of matrix product and store the result

2433

short8 acc_qs16;

2434

acc_qs16 = convert_short8_sat(acc0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2435

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2436

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2437

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2438

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2439

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2440

acc_qs16 = convert_short8_sat(acc1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2441

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2442

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2443

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2444

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2445

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

2446

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2447

acc_qs16 = convert_short8_sat(acc2);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2448

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2449

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2450

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2451

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2452

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

2453

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

2454

acc_qs16 = convert_short8_sat(acc3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2455

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2456

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2457

#endif // defined(ALPHA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2458

vstore8(acc_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2459

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2460

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2461

#endif // defined(FIXED_POINT_POSITION)

2462

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2463

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2464

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2465

/** In-place weighted matrix addition for F32: dst = dst + BETA * src.
 *
 * The destination matrix already holds the result of the (alpha-scaled) A x B product; this kernel
 * adds matrix C (src), weighted by the scalar beta, on top of it. Each work-item loads, updates and
 * stores 4 consecutive float values.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F32
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix (read as the A x B result, overwritten with the sum). Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result (dst)
    Image mat_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mat_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global float *ab_ptr = (__global float *)mat_ab.ptr;
    __global float *c_ptr  = (__global float *)mat_c.ptr;

    // Fetch 4 values of A x B followed by the matching 4 values of C
    const float4 ab_vals = vload4(0, ab_ptr);
    const float4 c_vals  = vload4(0, c_ptr);

    // axb + beta * c, written back over the A x B values
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_ptr);
}

2501

2502

/** In-place weighted matrix addition for F16: dst = dst + BETA * src.
 *
 * The destination matrix already holds the result of the (alpha-scaled) A x B product; this kernel
 * adds matrix C (src), weighted by the scalar beta, on top of it. Each work-item loads, updates and
 * stores 8 consecutive half values.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F16
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix (read as the A x B result, overwritten with the sum). Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result (dst)
    Image mat_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mat_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global half *ab_ptr = (__global half *)mat_ab.ptr;
    __global half *c_ptr  = (__global half *)mat_c.ptr;

    // Fetch 8 values of A x B followed by the matching 8 values of C
    const half8 ab_vals = vload8(0, ab_ptr);
    const half8 c_vals  = vload8(0, c_ptr);

    // axb + beta * c, written back over the A x B values
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_ptr);
}

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2538

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2539

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2540

/** In-place weighted matrix addition in 8 bit fixed point: dst = dst + BETA * src (saturating).
 *
 * The destination matrix already holds the result of the (alpha-scaled) A x B product; this kernel
 * accumulates matrix C (src), weighted by the scalar beta, on top of it through mla_sat_qs8x16.
 * Each work-item loads, updates and stores 16 consecutive 8 bit values.
 *
 * @note The value of beta and the fixed point position must be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
 *
 * @note BETA must be passed in 8 bit fixed point format
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: QS8
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix (read as the A x B result, overwritten with the sum). Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result (dst)
    Image mat_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mat_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global char *ab_ptr = (__global char *)mat_ab.ptr;
    __global char *c_ptr  = (__global char *)mat_c.ptr;

    // Fetch 16 values of A x B followed by the matching 16 values of C
    const char16 ab_vals = vload16(0, ab_ptr);
    const char16 c_vals  = vload16(0, c_ptr);

    // Saturating multiply-accumulate: axb + beta * c, written back over the A x B values
    const char16 beta_vec = (char16)BETA;
    vstore16(mla_sat_qs8x16(ab_vals, beta_vec, c_vals, FIXED_POINT_POSITION), 0, ab_ptr);
}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

2578

2579

/** In-place weighted matrix addition in 16 bit fixed point: dst = dst + BETA * src (saturating).
 *
 * The destination matrix already holds the result of the (alpha-scaled) A x B product; this kernel
 * accumulates matrix C (src), weighted by the scalar beta, on top of it through mla_sat_qs16x8.
 * Each work-item loads, updates and stores 8 consecutive 16 bit values.
 *
 * @note The value of beta and the fixed point position must be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
 *
 * @note BETA must be passed in 16 bit fixed point format
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: QS16
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix (read as the A x B result, overwritten with the sum). Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
                           IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result (dst)
    Image mat_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mat_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global short *ab_ptr = (__global short *)mat_ab.ptr;
    __global short *c_ptr  = (__global short *)mat_c.ptr;

    // Fetch 8 values of A x B followed by the matching 8 values of C
    const short8 ab_vals = vload8(0, ab_ptr);
    const short8 c_vals  = vload8(0, c_ptr);

    // Saturating multiply-accumulate: axb + beta * c, written back over the A x B values
    const short8 beta_vec = (short8)BETA;
    vstore8(mla_sat_qs16x8(ab_vals, beta_vec, c_vals, FIXED_POINT_POSITION), 0, ab_ptr);
}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2617

#endif // defined(FIXED_POINT_POSITION)

2618

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2619

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2620

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2621

/** Vector by matrix multiplication between each row of A (src0) and matrix B (src1), used for locally connected layer.
 *
 * The Y global id selects both the row of A and the Z slice of B that are multiplied together.
 * Each work-item accumulates 4 consecutive F32 output values and stores them to the destination.
 *
 * @note The width of A must be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @note The input A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // First of the 4 output columns handled by this work-item, and the row of A / slice of B it uses
    const int col = get_global_id(0) * 4;
    const int row = get_global_id(1);

    // Byte offsets of the current element of vector A and of the current row of matrix B
    int a_off = src0_offset_first_element_in_bytes + src0_stride_y * row;
    int b_off = src1_offset_first_element_in_bytes + src1_stride_z * row;
    b_off += col * sizeof(float);

    // Byte offset one past the last element of the row of A
    const int a_end = a_off + (WIDTH_VECTOR_A * sizeof(float));

    // Last offset at which a full pair of A elements can still be read
    const int a_last_pair = a_end - 2 * (int)sizeof(float);

    float4 sum = 0.0f;

    // Main loop: consume two elements of A and two rows of B per iteration
    while(a_off <= a_last_pair)
    {
        const float2 a_pair = vload2(0, (__global float *)(src0_ptr + a_off));
        const float4 b_row0 = vload4(0, (__global float *)(src1_ptr + b_off));
        const float4 b_row1 = vload4(0, (__global float *)(src1_ptr + b_off + src1_stride_y));

        sum += b_row0 * (float4)a_pair.s0;
        sum += b_row1 * (float4)a_pair.s1;

        a_off += (int)(2 * sizeof(float));
        b_off += (int)(2 * src1_stride_y);
    }

    // Left-over loop: consume the remaining elements of A one at a time
    while(a_off < a_end)
    {
        const float  a_val = *((__global float *)(src0_ptr + a_off));
        const float4 b_row = vload4(0, (__global float *)(src1_ptr + b_off));

        sum += b_row * (float4)a_val;

        a_off += (int)(sizeof(float));
        b_off += (int)(src1_stride_y);
    }

    // Compute destination address and store the 4 accumulated values
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(sum, 0, (__global float *)(offset(&dst, 0, 0)));
}

Anton Lokhmotov