Blame - src/core/CL/cl_kernels/gemv.cl - ml/ComputeLibrary

* @note Data type, source width and source height should be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height, e.g. -DDATA_TYPE=short

30

*

31

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

32

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

33

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

34

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

35

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

36

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

37

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

38

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

39

* @param[in] weights_ptr Pointer to the weights tensor. Same as @p src_ptr

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

40

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

41

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

42

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

43

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

44

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

45

* @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr

46

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

47

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

48

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

49

*/

50

__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Each work-item produces up to four consecutive output elements (rows y .. y + 3 of plane z),
    // so that every weights vector loaded in the loop below is reused for four rows.
    const int y = get_global_id(1) * 4;
    const int z = get_global_id(2);

    // Weights row that belongs to plane z.
    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
    // NOTE(review): src.ptr is presumably already positioned on row y of plane z by
    // CONVERT_TO_TENSOR3D_STRUCT (depends on the step values set by the host) - confirm.
    __global uchar *input_ptr = src.ptr;

    // Byte offsets of the four rows relative to input_ptr. They do not depend on the loop index,
    // so they are computed once here instead of on every iteration.
    const int4 row_offset = (int4)(0, 1, 2, 3) * (int4)src_stride_y;

    DATA_TYPE acc0 = (DATA_TYPE)0;
    DATA_TYPE acc1 = (DATA_TYPE)0;
    DATA_TYPE acc2 = (DATA_TYPE)0;
    DATA_TYPE acc3 = (DATA_TYPE)0;

    // Walk along X four elements at a time.
    // NOTE(review): vload4 always reads four elements and all four rows are always loaded, even when
    // SRC_WIDTH or the remaining row count is not a multiple of four; this presumably relies on
    // tensor padding provided by the host - confirm.
    for(int i = 0; i < SRC_WIDTH; i += 4)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));

        const int4 offset = (int4)i * (int4)src_stride_x + row_offset;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
        VEC_DATA_TYPE(DATA_TYPE, 4)
        tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
        VEC_DATA_TYPE(DATA_TYPE, 4)
        tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
        VEC_DATA_TYPE(DATA_TYPE, 4)
        tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));

        acc0 += dot(weights, tmp0);
        acc1 += dot(weights, tmp1);
        acc2 += dot(weights, tmp2);
        acc3 += dot(weights, tmp3);
    }

    // Output element for row y of plane z: index (y + z * SRC_HEIGHT) in the destination vector.
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;

    // rows_left >= 0  : all four rows are inside the plane.
    // rows_left == -k : only (4 - k) rows are valid, the remaining k accumulators are discarded.
    const int rows_left = SRC_HEIGHT - (y + 4);

    if(rows_left >= 0)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
        vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
    }
    else
    {
        // Partial tail: the cases deliberately cascade so that "three rows left" also stores
        // rows 1 and 0, and "two rows left" also stores row 0.
        switch(rows_left)
        {
            case -1: // three rows left; one is padding
                *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
            // fall through
            case -2: // two rows left; two are padding
                *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
            // fall through
            case -3: // one row left; three are padding
                *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
                break;
            default: // rows_left <= -4: every row of this work-item is outside the plane, store nothing
                break;
        }
    }
}

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

113

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

114

/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor.

115

*

Michele Di Giorgio

cbbed28

2019-12-20 13:26:08 +0000

[diff] [blame]

116

* @note Input data type, source width and source height should be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height, e.g. -DDATA_TYPE=uchar

117

*

118

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

119

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

120

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

121

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

122

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

123

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

124

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

125

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

cbbed28

2019-12-20 13:26:08 +0000

[diff] [blame]

126

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

127

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

128

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

129

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

130

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

131

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

Michele Di Giorgio

cbbed28

2019-12-20 13:26:08 +0000

[diff] [blame]

132

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

133

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

134

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

135

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

136

* @param[in] input_offset Input's quantization offset

137

* @param[in] weights_offset Weights' quantization offset

138

*/

139

__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
                                IMAGE_DECLARATION(weights),
                                VECTOR_DECLARATION(dst),
                                const int input_offset,
                                const int weights_offset)
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

    // Each work-item produces up to four consecutive output elements (rows y .. y + 3 of plane z),
    // so that every weights vector loaded in the loop below is reused for four rows.
    const int y = get_global_id(1) * 4;
    const int z = get_global_id(2);

    // Weights row that belongs to plane z.
    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
    // NOTE(review): src.ptr is presumably already positioned on row y of plane z by
    // CONVERT_TO_TENSOR3D_STRUCT (depends on the step values set by the host) - confirm.
    __global uchar *input_ptr = src.ptr;

    // Byte offsets of the four rows relative to input_ptr. They do not depend on the loop index,
    // so they are computed once here instead of on every iteration.
    const int4 row_offset = (int4)(0, 1, 2, 3) * (int4)src_stride_y;

    // Accumulation is done in 32-bit integers; the result is stored as int.
    int acc0 = 0;
    int acc1 = 0;
    int acc2 = 0;
    int acc3 = 0;

    // Walk along X four elements at a time.
    // NOTE(review): vload4 always reads four elements and all four rows are always loaded, even when
    // SRC_WIDTH or the remaining row count is not a multiple of four; this presumably relies on
    // tensor padding provided by the host - confirm.
    for(int i = 0; i < SRC_WIDTH; i += 4)
    {
        // Widen the quantized weights to int and add the weights quantization offset.
        const int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;

        const int4 offset = (int4)i * (int4)src_stride_x + row_offset;

        // Widen the quantized inputs to int and add the input quantization offset.
        const int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
        const int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
        const int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
        const int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;

        // Accumulate the integer dot product of each row with the weights.
        acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
        acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
        acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
        acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
    }

    // Output element for row y of plane z: index (y + z * SRC_HEIGHT) in the destination vector.
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;

    // rows_left >= 0  : all four rows are inside the plane.
    // rows_left == -k : only (4 - k) rows are valid, the remaining k accumulators are discarded.
    const int rows_left = SRC_HEIGHT - (y + 4);

    if(rows_left >= 0)
    {
        vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
    }
    else
    {
        // Partial tail: the cases deliberately cascade so that "three rows left" also stores
        // rows 1 and 0, and "two rows left" also stores row 0.
        switch(rows_left)
        {
            case -1: // three rows left; one is padding
                *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
            // fall through
            case -2: // two rows left; two are padding
                *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
            // fall through
            case -3: // one row left; three are padding
                *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
                break;
            default: // rows_left <= -4: every row of this work-item is outside the plane, store nothing
                break;
        }
    }
}

Michele Di Giorgio

cbbed28

2019-12-20 13:26:08 +0000

[diff] [blame]

200

#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */