blob: da915778e7e63e61cecae6726265f8468a0219a7 [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco7b4d5472018-01-10 15:56:30 +00002 * Copyright (c) 2017-2018 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco05288a22017-11-21 10:57:50 +000026
Gian Marco19835e52018-01-30 13:35:54 +000027#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +000028/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco19835e52018-01-30 13:35:54 +000029 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
Gian Marco05288a22017-11-21 10:57:50 +000030 *
Gian Marco19835e52018-01-30 13:35:54 +000031 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
32 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
33 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
Gian Marco05288a22017-11-21 10:57:50 +000034 *
35 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
36 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
37 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
38 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
39 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
40 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
41 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
42 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
43 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
44 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
45 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
46 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
47 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
48 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
49 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
50 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
51 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
52 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
53 */
Gian Marco19835e52018-01-30 13:35:54 +000054__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
55 IMAGE_DECLARATION(src1),
56 IMAGE_DECLARATION(dst))
Gian Marco05288a22017-11-21 10:57:50 +000057{
Gian Marco19835e52018-01-30 13:35:54 +000058 int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
59 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marco05288a22017-11-21 10:57:50 +000060
Gian Marco19835e52018-01-30 13:35:54 +000061 // Offset
62 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
63 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
64
65 // src_addr_a = address of matrix A
66 // src_addr_b = address of matrix B
67 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
68 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco05288a22017-11-21 10:57:50 +000069
70 // Compute end row address for matrix B
Gian Marco19835e52018-01-30 13:35:54 +000071 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
72
73 src_addr_a += offset_row_a;
74 src_addr_b += offset_row_b;
Gian Marco05288a22017-11-21 10:57:50 +000075
76 // Reset accumulators
Gian Marco19835e52018-01-30 13:35:54 +000077 int4 c00 = 0;
78 int4 c10 = 0;
79 int4 c20 = 0;
80 int4 c30 = 0;
Gian Marco05288a22017-11-21 10:57:50 +000081
Gian Marco19835e52018-01-30 13:35:54 +000082 for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +000083 {
84 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +000085 int4 a0 = convert_int4(vload4(0, src_addr_a));
86 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +000087
Gian Marco19835e52018-01-30 13:35:54 +000088 c00 += (int4)a0.s0 * b0;
89 c10 += (int4)a0.s1 * b0;
90 c20 += (int4)a0.s2 * b0;
91 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +000092
Gian Marco19835e52018-01-30 13:35:54 +000093 a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
94 b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
Gian Marco05288a22017-11-21 10:57:50 +000095
Gian Marco19835e52018-01-30 13:35:54 +000096 c00 += (int4)a0.s0 * b0;
97 c10 += (int4)a0.s1 * b0;
98 c20 += (int4)a0.s2 * b0;
99 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000100 }
101
Gian Marco19835e52018-01-30 13:35:54 +0000102 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
Gian Marco05288a22017-11-21 10:57:50 +0000103 {
104 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000105 int4 a0 = convert_int4(vload4(0, src_addr_a));
106 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000107
Gian Marco19835e52018-01-30 13:35:54 +0000108 c00 += (int4)a0.s0 * b0;
109 c10 += (int4)a0.s1 * b0;
110 c20 += (int4)a0.s2 * b0;
111 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000112 }
113
114 // Compute destination address
115 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
116
Gian Marco19835e52018-01-30 13:35:54 +0000117 // Store 4x4 block
118 vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
119 vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
120 vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
121 vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
Gian Marco05288a22017-11-21 10:57:50 +0000122}
Gian Marco19835e52018-01-30 13:35:54 +0000123
124/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
125 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
126 *
127 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
128 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
129 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
130 *
131 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
132 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
133 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
134 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
135 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
136 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
137 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
138 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
139 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
140 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
141 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
142 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
143 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
144 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
145 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
146 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
147 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
148 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
149 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    // Index of the transposed block of B (x) and interleaved block of A (y)
    // assigned to this work-item.
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset (in bytes) of this work-item's rows inside the reshaped blocks
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B (COLS_B bytes past the row base,
    // computed before applying offset_row_b)
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators: one scalar uint per element of the 4x4 output block
    // (cRC = output row R, column C).
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

// The 32-wide unrolled main loop below hard-codes A strides of 16 bytes per
// vload16, which only matches the interleaved layout when the interleave
// height multiplier is 1; other multipliers fall through to the generic
// leftover loop.
#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
        // Fast path using arm_dot.
        // NOTE(review): arm_dot is assumed to be the ARM 8-bit dot-product
        // builtin (uchar4 . uchar4 -> accumulated u32) provided when
        // ARM_COMPUTE_OPENCL_DOT8_ENABLED is set — confirm against helpers.h.
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);
        uchar4 b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4 b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4 b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate: each arm_dot folds 4 K-steps; a0.sN gathers row N of A
        // across the 4 interleaved groups, bX.sM gathers column M of B.
        c00 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c01 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c02 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c03 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c10 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c11 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c12 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c13 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c20 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c21 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c22 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c23 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c30 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c31 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c32 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c33 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        // Load values from matrix A (interleaved) and matrix B (transposed)
        // -- second half of the 32-step unroll
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
        b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
        b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
        b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        c00 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c01 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c02 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c03 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c10 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c11 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c12 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c13 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c20 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c21 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c22 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c23 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c30 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c31 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c32 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c33 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));
#else  // ARM_COMPUTE_OPENCL_DOT8_ENABLED
        // Scalar fallback: widen each u8*u8 product through ushort before
        // accumulating into the uint accumulators (avoids uchar overflow).
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        // -- second half of the 32-step unroll
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;
#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    // Leftover loop: one 4-value accumulation step per iteration, until the
    // end of B's row is reached.
    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block of S32 results (one row per vstore4)
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
475#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000476
477#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
478#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
479#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
480#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
 481/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
482 *
483 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
484 *
485 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
486 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
487 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
488 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
489 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
490 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
491 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
492 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
493 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
494 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
495 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
496 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
497 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
498 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
499 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
500 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
501 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
502 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
503 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000504__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
505 IMAGE_DECLARATION(src1),
506 IMAGE_DECLARATION(dst))
Gian Marco05288a22017-11-21 10:57:50 +0000507{
508 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
509
510 // Compute starting address for matrix A and Matrix B
511 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
512
513 // Update address for the matrix A
514 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
515
516 // Update address for the matrix B
517 src_addr.s1 += idx;
518
519 int end_row_vec_a = src_addr.s0 + COLS_A;
520
521 VECTOR_UINT acc0 = 0;
522#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
523 VECTOR_UINT acc1 = 0;
524#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
525#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
526 VECTOR_UINT acc2 = 0;
527#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
528#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
529 VECTOR_UINT acc3 = 0;
530#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000531#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
532 VECTOR_UINT acc4 = 0;
533#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000534
535 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
536 {
537 // Load values from matrix A
538 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
539#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
540 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
541#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
542#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
543 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
544#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
545#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
546 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
547#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000548#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
549 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
550#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000551 // Load values from matrix B
552 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
553 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
554
555 // Accumulate
556 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
557 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
558#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
559 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
560 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
561#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
562#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
563 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
564 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
565#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
566#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
567 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
568 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
569#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000570#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
571 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
572 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
573#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000574 }
575
576 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
577 {
578 // Load values from matrix A
579 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
580#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
581 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
582#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
583#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
584 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
585#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
586#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
587 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
588#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000589#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
590 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
591#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000592 // Load values from matrix B
593 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
594
595 // Accumulate
596 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
597#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
598 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
599#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
600#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
601 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
602#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
603#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
604 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
605#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000606#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
607 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
608#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000609 }
610
611 // Compute destination address
612 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
613
614 // Store the result
615 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
616 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
617#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
618 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
619 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
620#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
621#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
622 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
623 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
624#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
625#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
626 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
627 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
628#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000629#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
630 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
631 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
632#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
633}
634
635/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
636 *
637 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
638 *
639 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
640 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
641 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
642 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
643 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
644 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
645 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
646 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
647 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
648 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
649 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
650 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
651 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
652 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
653 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
654 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
655 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
656 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
657 */
658__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
659 IMAGE_DECLARATION(src1),
660 IMAGE_DECLARATION(dst))
661{
662 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
663
664 // Compute starting address for matrix A and Matrix B
665 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
666
667 // Update address for the matrix A
668 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
669
670 // Update address for the matrix B
671 src_addr.s1 += idx;
672
673 int end_row_vec_a = src_addr.s0 + COLS_A;
674
675 uint acc00 = 0;
676 uint acc01 = 0;
677 uint acc02 = 0;
678 uint acc03 = 0;
679#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
680 uint acc10 = 0;
681 uint acc11 = 0;
682 uint acc12 = 0;
683 uint acc13 = 0;
684#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
685#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
686 uint acc20 = 0;
687 uint acc21 = 0;
688 uint acc22 = 0;
689 uint acc23 = 0;
690#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
691#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
692 uint acc30 = 0;
693 uint acc31 = 0;
694 uint acc32 = 0;
695 uint acc33 = 0;
696#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
697#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
698 uint acc40 = 0;
699 uint acc41 = 0;
700 uint acc42 = 0;
701 uint acc43 = 0;
702#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
703
704 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
705 {
706 // Load values from matrix A
707 uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
708#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
709 uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
710#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
711#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
712 uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
713#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
714#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
715 uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
716#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
717#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
718 uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
719#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
720 // Load values from matrix B
721 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
722 uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
723 uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
724 uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
725
726 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000727#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
728 // Accumulate
729 acc00 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a0);
730 acc01 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a0);
731 acc02 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a0);
732 acc03 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a0);
733#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000734 // Accumulate
735 ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
736 ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
737 ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
738 ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
739
740 ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
741 ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
742 ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
743 ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
744
745 ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
746 ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
747 ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
748 ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
749
750 ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
751 ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
752 ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
753 ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
754
755 acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
756 acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
757 acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
758 acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000759#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000760 }
761#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
762 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000763#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
764 // Accumulate
765 acc10 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a1);
766 acc11 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a1);
767 acc12 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a1);
768 acc13 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a1);
769#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000770 // Accumulate
771 ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
772 ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
773 ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
774 ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
775
776 ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
777 ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
778 ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
779 ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
780
781 ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
782 ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
783 ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
784 ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
785
786 ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
787 ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
788 ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
789 ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
790
791 acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
792 acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
793 acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
794 acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000795#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000796 }
797#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
798#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
799 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000800#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
801 // Accumulate
802 acc20 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a2);
803 acc21 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a2);
804 acc22 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a2);
805 acc23 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a2);
806#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000807 // Accumulate
808 ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
809 ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
810 ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
811 ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
812
813 ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
814 ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
815 ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
816 ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
817
818 ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
819 ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
820 ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
821 ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
822
823 ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
824 ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
825 ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
826 ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
827
828 acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
829 acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
830 acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
831 acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000832#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000833 }
834#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
835#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
836 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000837#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
838 // Accumulate
839 acc30 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a3);
840 acc31 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a3);
841 acc32 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a3);
842 acc33 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a3);
843#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000844 // Accumulate
845 ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
846 ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
847 ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
848 ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
849
850 ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
851 ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
852 ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
853 ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
854
855 ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
856 ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
857 ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
858 ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
859
860 ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
861 ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
862 ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
863 ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
864
865 acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
866 acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
867 acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
868 acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000869#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000870 }
871#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
872#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
873 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000874#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
875 // Accumulate
876 acc40 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a4);
877 acc41 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a4);
878 acc42 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a4);
879 acc43 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a4);
880#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000881 // Accumulate
882 ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
883 ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
884 ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
885 ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
886
887 ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
888 ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
889 ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
890 ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
891
892 ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
893 ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
894 ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
895 ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
896
897 ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
898 ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
899 ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
900 ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
901
902 acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
903 acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
904 acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
905 acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000906#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000907 }
908#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
909 }
910
911 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
912 {
913 // Load values from matrix A
914 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
915#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
916 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
917#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
918#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
919 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
920#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
921#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
922 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
923#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
924#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
925 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
926#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
927 // Load values from matrix B
928 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
929
930 // Accumulate
931 {
932 // Accumulate
933 ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
934 ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
935 ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
936 ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
937
938 acc00 += ((uint)tmp0);
939 acc01 += ((uint)tmp1);
940 acc02 += ((uint)tmp2);
941 acc03 += ((uint)tmp3);
942 }
943#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
944 {
945 // Accumulate
946 ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
947 ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
948 ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
949 ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
950
951 acc10 += ((uint)tmp0);
952 acc11 += ((uint)tmp1);
953 acc12 += ((uint)tmp2);
954 acc13 += ((uint)tmp3);
955 }
956#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
957#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
958 {
959 // Accumulate
960 ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
961 ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
962 ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
963 ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
964
965 acc20 += ((uint)tmp0);
966 acc21 += ((uint)tmp1);
967 acc22 += ((uint)tmp2);
968 acc23 += ((uint)tmp3);
969 }
970#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
971#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
972 {
973 // Accumulate
974 ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
975 ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
976 ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
977 ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
978
979 acc30 += ((uint)tmp0);
980 acc31 += ((uint)tmp1);
981 acc32 += ((uint)tmp2);
982 acc33 += ((uint)tmp3);
983 }
984#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
985#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
986 {
987 // Accumulate
988 ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
989 ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
990 ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
991 ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
992
993 acc40 += ((uint)tmp0);
994 acc41 += ((uint)tmp1);
995 acc42 += ((uint)tmp2);
996 acc43 += ((uint)tmp3);
997 }
998#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
999 }
1000
1001 // Compute destination address
1002 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1003
1004 // Store the result
1005 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
1006#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1007 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
1008#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1009#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1010 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
1011#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1012#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1013 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
1014#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1015#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1016 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
1017#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +00001018}
1019#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
1020
#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Each workitem reduces one full row of matrix A (row selected by gid0, batch by gid1)
    __global const uchar *row_ptr = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    uint4 vec_acc    = (uint4)0; // vectorized partial sums
    uint  scalar_acc = 0;        // leftover element sums

    int col = 0;

    // Main loop: 16 accumulations per iteration
    for(; col <= ((int)COLS_A - 16); col += 16)
    {
        const uchar16 vals = vload16(0, row_ptr + col);

        vec_acc += convert_uint4(vals.s0123) + convert_uint4(vals.s4567) + convert_uint4(vals.s89AB) + convert_uint4(vals.sCDEF);
    }

    // Leftover loop: remaining columns one at a time
    for(; col < COLS_A; ++col)
    {
        scalar_acc += row_ptr[col];
    }

    // Horizontal reduction of the vector accumulator into the scalar total
    scalar_acc += vec_acc.s0 + vec_acc.s1 + vec_acc.s2 + vec_acc.s3;

    *((__global int *)dst.ptr) = (int)scalar_acc;
}
#endif // defined(COLS_A)
1077
#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 per-column partial sums, one per element handled by this workitem
    uint16 col_sums = (uint16)0;

    // Each workitem walks all rows of one batch of matrix B (batch selected by gid1)
    __global const uchar *row_ptr = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int row = 0;

    // Main loop: accumulate 4 rows of 16 columns per iteration
    for(; row <= ((int)ROWS_B - 4); row += 4)
    {
        const uchar16 r0 = vload16(0, row_ptr + 0 * src_stride_y);
        const uchar16 r1 = vload16(0, row_ptr + 1 * src_stride_y);
        const uchar16 r2 = vload16(0, row_ptr + 2 * src_stride_y);
        const uchar16 r3 = vload16(0, row_ptr + 3 * src_stride_y);

        col_sums += convert_uint16(r0) + convert_uint16(r1) + convert_uint16(r2) + convert_uint16(r3);

        row_ptr += 4 * src_stride_y;
    }

    // Leftover loop: remaining rows one at a time
    for(; row < (int)ROWS_B; ++row)
    {
        col_sums += convert_uint16(vload16(0, row_ptr));

        row_ptr += src_stride_y;
    }

    vstore16(convert_int16(col_sums), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
1139
#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr                                Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                           Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                             mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y                           Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                             mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z                           Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                             mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes      The offset of the first element in the source tensor
 * @param[in] sum_col_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_result_step_x                        sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_result_step_y                        sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_row_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_result_step_x                        sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_result_step_y                        sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int4 a_offset_s32 = (int4)0;
    int4 b_offset_s32 = (int4)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET: one sum_col value per output column
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int4)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET: one sum_row value broadcast over the row
    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int4)B_OFFSET;
#endif // defined(B_OFFSET)

    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution (in-place)
    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
1227
1228#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1229/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1230 *
1231 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
1232 * The following computations will be performed by the kernel:
1233 *
1234 * -# Add offset terms to final result
1235 * -# Multiply each entry of result by result_mult_int
1236 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
1237 * -# Shift the int32 accumulator by result_shift
1238 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
1239 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1240 *
1241 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
1242 *
1243 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1244 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1245 * These values can be used to implement "rectified linear unit" activation functions
1246 *
1247 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1248 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1249 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1250 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1251 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1252 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1253 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1254 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1255 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
1256 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
1257 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
1258 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
1266 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1267 */
1268__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
1269#if defined(ADD_BIAS)
1270 VECTOR_DECLARATION(biases),
1271#endif // defined(ADD_BIAS)
1272 TENSOR3D_DECLARATION(dst))
1273{
1274 // Compute source and destination addresses
1275 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1276 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
1277#if defined(ADD_BIAS)
1278 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1279#endif // defined(ADD_BIAS)
1280
1281 int16 input_values = vload16(0, (__global int *)src.ptr);
1282
Gian Marco58c57942017-11-28 09:10:03 +00001283 // Add the offset terms to GEMM's result
1284 input_values += (int16)RESULT_OFFSET;
1285
Gian Marco05288a22017-11-21 10:57:50 +00001286#if defined(ADD_BIAS)
1287 // Add bias
1288 const int16 biases_values = vload16(0, (__global int *)biases.ptr);
1289 input_values += (int16)biases_values;
1290#endif // defined(ADD_BIAS)
1291
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00001292 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00001293 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00001294
Gian Marco58c57942017-11-28 09:10:03 +00001295 input_values >>= RESULT_SHIFT;
Gian Marco05288a22017-11-21 10:57:50 +00001296
1297 uchar16 res = convert_uchar16_sat(input_values);
1298
1299#if defined(MIN_BOUND)
1300 res = max(res, (uchar16)MIN_BOUND);
1301#endif // defined(MIN_BOUND)
1302#if defined(MAX_BOUND)
1303 res = min(res, (uchar16)MAX_BOUND);
1304#endif // defined(MAX_BOUND)
1305
1306 // Store the result
1307 vstore16(res, 0, dst.ptr);
1308}
Gian Marco58c57942017-11-28 09:10:03 +00001309#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1310
1311#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
1312/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1313 *
1314 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
1315 * The following computations will be performed by the kernel:
1316 *
1317 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1318 * -# Add bias to final result if bias tensor is not a nullptr
1319 * -# Round to nearest division by a power-of-two using result_shift
1320 * -# Add offset to each result
1321 * -# Clamp the value between the specified min and max bounds
1322 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1323 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
1325 *
1326 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1327 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1328 * These values can be used to implement "rectified linear unit" activation functions
1329 *
1330 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1331 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1332 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1333 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1334 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1335 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1336 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1337 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1338 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
1339 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
1340 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
1341 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
1349 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1350 */
1351__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
1352#if defined(ADD_BIAS)
1353 VECTOR_DECLARATION(biases),
1354#endif // defined(ADD_BIAS)
1355 TENSOR3D_DECLARATION(dst))
1356{
1357 // Compute source and destination addresses
1358 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1359 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
1360#if defined(ADD_BIAS)
1361 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1362#endif // defined(ADD_BIAS)
1363
1364 int16 input_values = vload16(0, (__global int *)src.ptr);
1365
1366#if defined(ADD_BIAS)
1367 // Add bias
1368 const int16 biases_values = vload16(0, (__global int *)biases.ptr);
1369 input_values += (int16)biases_values;
1370#endif // defined(ADD_BIAS)
1371
1372 // Multiply by result_mult_int and shift
1373 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
1374
1375 // Add the offset terms to GEMM's result
1376 input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
1377
1378 uchar16 res = convert_uchar16_sat(input_values);
1379
1380#if defined(MIN_BOUND)
1381 res = max(res, (uchar16)MIN_BOUND);
1382#endif // defined(MIN_BOUND)
1383#if defined(MAX_BOUND)
1384 res = min(res, (uchar16)MAX_BOUND);
1385#endif // defined(MAX_BOUND)
1386
1387 // Store the result
1388 vstore16(res, 0, dst.ptr);
1389}
Chunosov5124be52017-11-22 20:42:13 +07001390#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)